diff --git a/Makefile b/Makefile index e9e111758..fa49ec775 100644 --- a/Makefile +++ b/Makefile @@ -37,6 +37,7 @@ mkfiles: includes: $(MAKE) -C include includes $(MAKE) -C lib includes NBSD_LIBC=yes + $(MAKE) -C sys includes MKHEADERSS=/usr/pkg/gcc*/libexec/gcc/*/*/install-tools/mkheaders gnu-includes: includes @@ -50,6 +51,7 @@ commands: includes libraries $(MAKE) -C bin all $(MAKE) -C sbin all $(MAKE) -C usr.bin all + $(MAKE) -C external all $(MAKE) -C libexec all $(MAKE) -C usr.sbin all @@ -59,6 +61,7 @@ dep-all: $(MAKE) -C bin dependall $(MAKE) -C sbin dependall $(MAKE) -C usr.bin dependall + $(MAKE) -C external dependall $(MAKE) -C libexec dependall $(MAKE) -C usr.sbin dependall $(MAKE) -C kernel dependall @@ -77,6 +80,7 @@ all: $(MAKE) -C bin all $(MAKE) -C sbin all $(MAKE) -C usr.bin all + $(MAKE) -C external all $(MAKE) -C libexec all $(MAKE) -C usr.sbin all $(MAKE) -C tools all @@ -89,6 +93,7 @@ install: $(MAKE) -C bin install $(MAKE) -C sbin install $(MAKE) -C usr.bin install + $(MAKE) -C external install $(MAKE) -C usr.sbin install $(MAKE) -C servers install $(MAKE) -C share install @@ -100,6 +105,7 @@ clean: mkfiles $(MAKE) -C bin clean $(MAKE) -C sbin clean $(MAKE) -C usr.bin clean + $(MAKE) -C external clean $(MAKE) -C libexec clean $(MAKE) -C usr.sbin clean $(MAKE) -C share clean @@ -114,6 +120,7 @@ cleandepend: mkfiles $(MAKE) -C bin cleandepend $(MAKE) -C sbin cleandepend $(MAKE) -C usr.bin cleandepend + $(MAKE) -C external cleandepend $(MAKE) -C libexec cleandepend $(MAKE) -C usr.sbin cleandepend $(MAKE) -C tools cleandepend diff --git a/common/include/arch/i386/disklabel.h b/common/include/arch/i386/disklabel.h deleted file mode 100644 index e7d5246bc..000000000 --- a/common/include/arch/i386/disklabel.h +++ /dev/null @@ -1,75 +0,0 @@ -/* $NetBSD: disklabel.h,v 1.16 2011/08/30 12:39:55 bouyer Exp $ */ - -/* - * Copyright (c) 1994 Christopher G. Demetriou - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by Christopher G. Demetriou. - * 4. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef _I386_DISKLABEL_H_ -#define _I386_DISKLABEL_H_ - -#define LABELUSESMBR 1 /* use MBR partitionning */ -#define LABELSECTOR 1 /* sector containing label */ -#define LABELOFFSET 0 /* offset of label in sector */ -#define MAXPARTITIONS 16 /* number of partitions */ -#define OLDMAXPARTITIONS 8 /* number of partitions before 1.6 */ -#define RAW_PART 3 /* raw partition: XX?d (XXX) */ - -/* - * We use the highest bit of the minor number for the partition number. - * This maintains backward compatibility with device nodes created before - * MAXPARTITIONS was increased. - */ -#define __I386_MAXDISKS ((1 << 20) / MAXPARTITIONS) -#define DISKUNIT(dev) ((minor(dev) / OLDMAXPARTITIONS) % __I386_MAXDISKS) -#define DISKPART(dev) ((minor(dev) % OLDMAXPARTITIONS) + \ - ((minor(dev) / (__I386_MAXDISKS * OLDMAXPARTITIONS)) * OLDMAXPARTITIONS)) -#define DISKMINOR(unit, part) \ - (((unit) * OLDMAXPARTITIONS) + ((part) % OLDMAXPARTITIONS) + \ - ((part) / OLDMAXPARTITIONS) * (__I386_MAXDISKS * OLDMAXPARTITIONS)) - -/* Pull in MBR partition definitions. */ -#if HAVE_NBTOOL_CONFIG_H -#include -#else -#include -#endif /* HAVE_NBTOOL_CONFIG_H */ - -#ifndef __ASSEMBLER__ -#if HAVE_NBTOOL_CONFIG_H -#include -#else -#include -#endif /* HAVE_NBTOOL_CONFIG_H */ -struct cpu_disklabel { -#define __HAVE_DISKLABEL_DKBAD - struct dkbad bad; -}; -#endif - -#endif /* _I386_DISKLABEL_H_ */ diff --git a/external/Makefile b/external/Makefile new file mode 100644 index 000000000..4072adc24 --- /dev/null +++ b/external/Makefile @@ -0,0 +1,3 @@ +SUBDIR=bsd + +.include diff --git a/external/bsd/Makefile b/external/bsd/Makefile new file mode 100644 index 000000000..58740673e --- /dev/null +++ b/external/bsd/Makefile @@ -0,0 +1,3 @@ +.include +SUBDIR=mdocml +.include diff --git a/usr.bin/mdocml/Makefile b/external/bsd/mdocml/Makefile similarity index 100% rename from usr.bin/mdocml/Makefile rename to external/bsd/mdocml/Makefile diff --git a/usr.bin/mdocml/Makefile.inc b/external/bsd/mdocml/Makefile.inc similarity index 95% rename from usr.bin/mdocml/Makefile.inc rename to external/bsd/mdocml/Makefile.inc index 7368b890c..6dd59dd61 100644 --- a/usr.bin/mdocml/Makefile.inc +++ b/external/bsd/mdocml/Makefile.inc @@ -1,7 +1,6 @@ # $NetBSD: Makefile.inc,v 1.12 2010/07/25 19:16:18 joerg Exp $ .include -.include "../Makefile.inc" VERSION!= cd ${.PARSEDIR}/dist && ${MAKE} -V VERSION diff --git a/usr.bin/mdocml/bin/Makefile b/external/bsd/mdocml/bin/Makefile similarity index 100% rename from usr.bin/mdocml/bin/Makefile rename to external/bsd/mdocml/bin/Makefile diff --git a/usr.bin/mdocml/bin/Makefile.inc b/external/bsd/mdocml/bin/Makefile.inc similarity index 100% rename from usr.bin/mdocml/bin/Makefile.inc rename to external/bsd/mdocml/bin/Makefile.inc diff --git a/usr.bin/mdocml/bin/mandoc/Makefile b/external/bsd/mdocml/bin/mandoc/Makefile similarity index 100% rename from usr.bin/mdocml/bin/mandoc/Makefile rename to external/bsd/mdocml/bin/mandoc/Makefile diff --git a/usr.bin/mdocml/dist/Makefile b/external/bsd/mdocml/dist/Makefile similarity index 100% rename from usr.bin/mdocml/dist/Makefile rename to external/bsd/mdocml/dist/Makefile diff --git a/usr.bin/mdocml/dist/arch.c b/external/bsd/mdocml/dist/arch.c similarity index 100% rename from usr.bin/mdocml/dist/arch.c rename to external/bsd/mdocml/dist/arch.c diff --git a/usr.bin/mdocml/dist/arch.in b/external/bsd/mdocml/dist/arch.in similarity index 100% rename from usr.bin/mdocml/dist/arch.in rename to external/bsd/mdocml/dist/arch.in diff --git a/usr.bin/mdocml/dist/att.c 
b/external/bsd/mdocml/dist/att.c similarity index 100% rename from usr.bin/mdocml/dist/att.c rename to external/bsd/mdocml/dist/att.c diff --git a/usr.bin/mdocml/dist/att.in b/external/bsd/mdocml/dist/att.in similarity index 100% rename from usr.bin/mdocml/dist/att.in rename to external/bsd/mdocml/dist/att.in diff --git a/usr.bin/mdocml/dist/chars.c b/external/bsd/mdocml/dist/chars.c similarity index 100% rename from usr.bin/mdocml/dist/chars.c rename to external/bsd/mdocml/dist/chars.c diff --git a/usr.bin/mdocml/dist/chars.h b/external/bsd/mdocml/dist/chars.h similarity index 100% rename from usr.bin/mdocml/dist/chars.h rename to external/bsd/mdocml/dist/chars.h diff --git a/usr.bin/mdocml/dist/chars.in b/external/bsd/mdocml/dist/chars.in similarity index 100% rename from usr.bin/mdocml/dist/chars.in rename to external/bsd/mdocml/dist/chars.in diff --git a/usr.bin/mdocml/dist/compat.c b/external/bsd/mdocml/dist/compat.c similarity index 100% rename from usr.bin/mdocml/dist/compat.c rename to external/bsd/mdocml/dist/compat.c diff --git a/usr.bin/mdocml/dist/config.h.post b/external/bsd/mdocml/dist/config.h.post similarity index 100% rename from usr.bin/mdocml/dist/config.h.post rename to external/bsd/mdocml/dist/config.h.post diff --git a/usr.bin/mdocml/dist/config.h.pre b/external/bsd/mdocml/dist/config.h.pre similarity index 100% rename from usr.bin/mdocml/dist/config.h.pre rename to external/bsd/mdocml/dist/config.h.pre diff --git a/usr.bin/mdocml/dist/example.style.css b/external/bsd/mdocml/dist/example.style.css similarity index 100% rename from usr.bin/mdocml/dist/example.style.css rename to external/bsd/mdocml/dist/example.style.css diff --git a/usr.bin/mdocml/dist/external.png.uu b/external/bsd/mdocml/dist/external.png.uu similarity index 100% rename from usr.bin/mdocml/dist/external.png.uu rename to external/bsd/mdocml/dist/external.png.uu diff --git a/usr.bin/mdocml/dist/html.c b/external/bsd/mdocml/dist/html.c similarity index 100% rename from usr.bin/mdocml/dist/html.c rename to external/bsd/mdocml/dist/html.c diff --git a/usr.bin/mdocml/dist/html.h b/external/bsd/mdocml/dist/html.h similarity index 100% rename from usr.bin/mdocml/dist/html.h rename to external/bsd/mdocml/dist/html.h diff --git a/usr.bin/mdocml/dist/lib.c b/external/bsd/mdocml/dist/lib.c similarity index 100% rename from usr.bin/mdocml/dist/lib.c rename to external/bsd/mdocml/dist/lib.c diff --git a/usr.bin/mdocml/dist/lib.in b/external/bsd/mdocml/dist/lib.in similarity index 100% rename from usr.bin/mdocml/dist/lib.in rename to external/bsd/mdocml/dist/lib.in diff --git a/usr.bin/mdocml/dist/libman.h b/external/bsd/mdocml/dist/libman.h similarity index 100% rename from usr.bin/mdocml/dist/libman.h rename to external/bsd/mdocml/dist/libman.h diff --git a/usr.bin/mdocml/dist/libmandoc.h b/external/bsd/mdocml/dist/libmandoc.h similarity index 100% rename from usr.bin/mdocml/dist/libmandoc.h rename to external/bsd/mdocml/dist/libmandoc.h diff --git a/usr.bin/mdocml/dist/libmdoc.h b/external/bsd/mdocml/dist/libmdoc.h similarity index 100% rename from usr.bin/mdocml/dist/libmdoc.h rename to external/bsd/mdocml/dist/libmdoc.h diff --git a/usr.bin/mdocml/dist/libroff.h b/external/bsd/mdocml/dist/libroff.h similarity index 100% rename from usr.bin/mdocml/dist/libroff.h rename to external/bsd/mdocml/dist/libroff.h diff --git a/usr.bin/mdocml/dist/main.c b/external/bsd/mdocml/dist/main.c similarity index 100% rename from usr.bin/mdocml/dist/main.c rename to external/bsd/mdocml/dist/main.c diff --git 
a/usr.bin/mdocml/dist/main.h b/external/bsd/mdocml/dist/main.h similarity index 100% rename from usr.bin/mdocml/dist/main.h rename to external/bsd/mdocml/dist/main.h diff --git a/usr.bin/mdocml/dist/man.3 b/external/bsd/mdocml/dist/man.3 similarity index 100% rename from usr.bin/mdocml/dist/man.3 rename to external/bsd/mdocml/dist/man.3 diff --git a/usr.bin/mdocml/dist/man.7 b/external/bsd/mdocml/dist/man.7 similarity index 100% rename from usr.bin/mdocml/dist/man.7 rename to external/bsd/mdocml/dist/man.7 diff --git a/usr.bin/mdocml/dist/man.c b/external/bsd/mdocml/dist/man.c similarity index 100% rename from usr.bin/mdocml/dist/man.c rename to external/bsd/mdocml/dist/man.c diff --git a/usr.bin/mdocml/dist/man.h b/external/bsd/mdocml/dist/man.h similarity index 100% rename from usr.bin/mdocml/dist/man.h rename to external/bsd/mdocml/dist/man.h diff --git a/usr.bin/mdocml/dist/man_argv.c b/external/bsd/mdocml/dist/man_argv.c similarity index 100% rename from usr.bin/mdocml/dist/man_argv.c rename to external/bsd/mdocml/dist/man_argv.c diff --git a/usr.bin/mdocml/dist/man_hash.c b/external/bsd/mdocml/dist/man_hash.c similarity index 100% rename from usr.bin/mdocml/dist/man_hash.c rename to external/bsd/mdocml/dist/man_hash.c diff --git a/usr.bin/mdocml/dist/man_html.c b/external/bsd/mdocml/dist/man_html.c similarity index 100% rename from usr.bin/mdocml/dist/man_html.c rename to external/bsd/mdocml/dist/man_html.c diff --git a/usr.bin/mdocml/dist/man_macro.c b/external/bsd/mdocml/dist/man_macro.c similarity index 100% rename from usr.bin/mdocml/dist/man_macro.c rename to external/bsd/mdocml/dist/man_macro.c diff --git a/usr.bin/mdocml/dist/man_term.c b/external/bsd/mdocml/dist/man_term.c similarity index 100% rename from usr.bin/mdocml/dist/man_term.c rename to external/bsd/mdocml/dist/man_term.c diff --git a/usr.bin/mdocml/dist/man_validate.c b/external/bsd/mdocml/dist/man_validate.c similarity index 100% rename from usr.bin/mdocml/dist/man_validate.c rename to external/bsd/mdocml/dist/man_validate.c diff --git a/usr.bin/mdocml/dist/mandoc b/external/bsd/mdocml/dist/mandoc similarity index 100% rename from usr.bin/mdocml/dist/mandoc rename to external/bsd/mdocml/dist/mandoc diff --git a/usr.bin/mdocml/dist/mandoc.1 b/external/bsd/mdocml/dist/mandoc.1 similarity index 100% rename from usr.bin/mdocml/dist/mandoc.1 rename to external/bsd/mdocml/dist/mandoc.1 diff --git a/usr.bin/mdocml/dist/mandoc.c b/external/bsd/mdocml/dist/mandoc.c similarity index 100% rename from usr.bin/mdocml/dist/mandoc.c rename to external/bsd/mdocml/dist/mandoc.c diff --git a/usr.bin/mdocml/dist/mandoc.h b/external/bsd/mdocml/dist/mandoc.h similarity index 100% rename from usr.bin/mdocml/dist/mandoc.h rename to external/bsd/mdocml/dist/mandoc.h diff --git a/usr.bin/mdocml/dist/mandoc_char.7 b/external/bsd/mdocml/dist/mandoc_char.7 similarity index 100% rename from usr.bin/mdocml/dist/mandoc_char.7 rename to external/bsd/mdocml/dist/mandoc_char.7 diff --git a/usr.bin/mdocml/dist/mdoc.3 b/external/bsd/mdocml/dist/mdoc.3 similarity index 100% rename from usr.bin/mdocml/dist/mdoc.3 rename to external/bsd/mdocml/dist/mdoc.3 diff --git a/usr.bin/mdocml/dist/mdoc.7 b/external/bsd/mdocml/dist/mdoc.7 similarity index 100% rename from usr.bin/mdocml/dist/mdoc.7 rename to external/bsd/mdocml/dist/mdoc.7 diff --git a/usr.bin/mdocml/dist/mdoc.c b/external/bsd/mdocml/dist/mdoc.c similarity index 100% rename from usr.bin/mdocml/dist/mdoc.c rename to external/bsd/mdocml/dist/mdoc.c diff --git a/usr.bin/mdocml/dist/mdoc.h 
b/external/bsd/mdocml/dist/mdoc.h similarity index 100% rename from usr.bin/mdocml/dist/mdoc.h rename to external/bsd/mdocml/dist/mdoc.h diff --git a/usr.bin/mdocml/dist/mdoc_argv.c b/external/bsd/mdocml/dist/mdoc_argv.c similarity index 100% rename from usr.bin/mdocml/dist/mdoc_argv.c rename to external/bsd/mdocml/dist/mdoc_argv.c diff --git a/usr.bin/mdocml/dist/mdoc_hash.c b/external/bsd/mdocml/dist/mdoc_hash.c similarity index 100% rename from usr.bin/mdocml/dist/mdoc_hash.c rename to external/bsd/mdocml/dist/mdoc_hash.c diff --git a/usr.bin/mdocml/dist/mdoc_html.c b/external/bsd/mdocml/dist/mdoc_html.c similarity index 100% rename from usr.bin/mdocml/dist/mdoc_html.c rename to external/bsd/mdocml/dist/mdoc_html.c diff --git a/usr.bin/mdocml/dist/mdoc_macro.c b/external/bsd/mdocml/dist/mdoc_macro.c similarity index 100% rename from usr.bin/mdocml/dist/mdoc_macro.c rename to external/bsd/mdocml/dist/mdoc_macro.c diff --git a/usr.bin/mdocml/dist/mdoc_strings.c b/external/bsd/mdocml/dist/mdoc_strings.c similarity index 100% rename from usr.bin/mdocml/dist/mdoc_strings.c rename to external/bsd/mdocml/dist/mdoc_strings.c diff --git a/usr.bin/mdocml/dist/mdoc_term.c b/external/bsd/mdocml/dist/mdoc_term.c similarity index 100% rename from usr.bin/mdocml/dist/mdoc_term.c rename to external/bsd/mdocml/dist/mdoc_term.c diff --git a/usr.bin/mdocml/dist/mdoc_validate.c b/external/bsd/mdocml/dist/mdoc_validate.c similarity index 100% rename from usr.bin/mdocml/dist/mdoc_validate.c rename to external/bsd/mdocml/dist/mdoc_validate.c diff --git a/usr.bin/mdocml/dist/msec.c b/external/bsd/mdocml/dist/msec.c similarity index 100% rename from usr.bin/mdocml/dist/msec.c rename to external/bsd/mdocml/dist/msec.c diff --git a/usr.bin/mdocml/dist/msec.in b/external/bsd/mdocml/dist/msec.in similarity index 100% rename from usr.bin/mdocml/dist/msec.in rename to external/bsd/mdocml/dist/msec.in diff --git a/usr.bin/mdocml/dist/out.c b/external/bsd/mdocml/dist/out.c similarity index 100% rename from usr.bin/mdocml/dist/out.c rename to external/bsd/mdocml/dist/out.c diff --git a/usr.bin/mdocml/dist/out.h b/external/bsd/mdocml/dist/out.h similarity index 100% rename from usr.bin/mdocml/dist/out.h rename to external/bsd/mdocml/dist/out.h diff --git a/usr.bin/mdocml/dist/roff.3 b/external/bsd/mdocml/dist/roff.3 similarity index 100% rename from usr.bin/mdocml/dist/roff.3 rename to external/bsd/mdocml/dist/roff.3 diff --git a/usr.bin/mdocml/dist/roff.7 b/external/bsd/mdocml/dist/roff.7 similarity index 100% rename from usr.bin/mdocml/dist/roff.7 rename to external/bsd/mdocml/dist/roff.7 diff --git a/usr.bin/mdocml/dist/roff.c b/external/bsd/mdocml/dist/roff.c similarity index 100% rename from usr.bin/mdocml/dist/roff.c rename to external/bsd/mdocml/dist/roff.c diff --git a/usr.bin/mdocml/dist/roff.h b/external/bsd/mdocml/dist/roff.h similarity index 100% rename from usr.bin/mdocml/dist/roff.h rename to external/bsd/mdocml/dist/roff.h diff --git a/usr.bin/mdocml/dist/st.c b/external/bsd/mdocml/dist/st.c similarity index 100% rename from usr.bin/mdocml/dist/st.c rename to external/bsd/mdocml/dist/st.c diff --git a/usr.bin/mdocml/dist/st.in b/external/bsd/mdocml/dist/st.in similarity index 100% rename from usr.bin/mdocml/dist/st.in rename to external/bsd/mdocml/dist/st.in diff --git a/usr.bin/mdocml/dist/tbl.7 b/external/bsd/mdocml/dist/tbl.7 similarity index 100% rename from usr.bin/mdocml/dist/tbl.7 rename to external/bsd/mdocml/dist/tbl.7 diff --git a/usr.bin/mdocml/dist/tbl.c b/external/bsd/mdocml/dist/tbl.c 
similarity index 100% rename from usr.bin/mdocml/dist/tbl.c rename to external/bsd/mdocml/dist/tbl.c diff --git a/usr.bin/mdocml/dist/tbl_data.c b/external/bsd/mdocml/dist/tbl_data.c similarity index 100% rename from usr.bin/mdocml/dist/tbl_data.c rename to external/bsd/mdocml/dist/tbl_data.c diff --git a/usr.bin/mdocml/dist/tbl_html.c b/external/bsd/mdocml/dist/tbl_html.c similarity index 100% rename from usr.bin/mdocml/dist/tbl_html.c rename to external/bsd/mdocml/dist/tbl_html.c diff --git a/usr.bin/mdocml/dist/tbl_layout.c b/external/bsd/mdocml/dist/tbl_layout.c similarity index 100% rename from usr.bin/mdocml/dist/tbl_layout.c rename to external/bsd/mdocml/dist/tbl_layout.c diff --git a/usr.bin/mdocml/dist/tbl_opts.c b/external/bsd/mdocml/dist/tbl_opts.c similarity index 100% rename from usr.bin/mdocml/dist/tbl_opts.c rename to external/bsd/mdocml/dist/tbl_opts.c diff --git a/usr.bin/mdocml/dist/tbl_term.c b/external/bsd/mdocml/dist/tbl_term.c similarity index 100% rename from usr.bin/mdocml/dist/tbl_term.c rename to external/bsd/mdocml/dist/tbl_term.c diff --git a/usr.bin/mdocml/dist/term.c b/external/bsd/mdocml/dist/term.c similarity index 100% rename from usr.bin/mdocml/dist/term.c rename to external/bsd/mdocml/dist/term.c diff --git a/usr.bin/mdocml/dist/term.h b/external/bsd/mdocml/dist/term.h similarity index 100% rename from usr.bin/mdocml/dist/term.h rename to external/bsd/mdocml/dist/term.h diff --git a/usr.bin/mdocml/dist/term_ascii.c b/external/bsd/mdocml/dist/term_ascii.c similarity index 100% rename from usr.bin/mdocml/dist/term_ascii.c rename to external/bsd/mdocml/dist/term_ascii.c diff --git a/usr.bin/mdocml/dist/term_ps.c b/external/bsd/mdocml/dist/term_ps.c similarity index 100% rename from usr.bin/mdocml/dist/term_ps.c rename to external/bsd/mdocml/dist/term_ps.c diff --git a/usr.bin/mdocml/dist/test-strlcat.c b/external/bsd/mdocml/dist/test-strlcat.c similarity index 100% rename from usr.bin/mdocml/dist/test-strlcat.c rename to external/bsd/mdocml/dist/test-strlcat.c diff --git a/usr.bin/mdocml/dist/test-strlcpy.c b/external/bsd/mdocml/dist/test-strlcpy.c similarity index 100% rename from usr.bin/mdocml/dist/test-strlcpy.c rename to external/bsd/mdocml/dist/test-strlcpy.c diff --git a/usr.bin/mdocml/dist/tree.c b/external/bsd/mdocml/dist/tree.c similarity index 100% rename from usr.bin/mdocml/dist/tree.c rename to external/bsd/mdocml/dist/tree.c diff --git a/usr.bin/mdocml/dist/vol.c b/external/bsd/mdocml/dist/vol.c similarity index 100% rename from usr.bin/mdocml/dist/vol.c rename to external/bsd/mdocml/dist/vol.c diff --git a/usr.bin/mdocml/dist/vol.in b/external/bsd/mdocml/dist/vol.in similarity index 100% rename from usr.bin/mdocml/dist/vol.in rename to external/bsd/mdocml/dist/vol.in diff --git a/usr.bin/mdocml/lib/Makefile b/external/bsd/mdocml/lib/Makefile similarity index 100% rename from usr.bin/mdocml/lib/Makefile rename to external/bsd/mdocml/lib/Makefile diff --git a/usr.bin/mdocml/lib/Makefile.inc b/external/bsd/mdocml/lib/Makefile.inc similarity index 100% rename from usr.bin/mdocml/lib/Makefile.inc rename to external/bsd/mdocml/lib/Makefile.inc diff --git a/usr.bin/mdocml/lib/libman/Makefile b/external/bsd/mdocml/lib/libman/Makefile similarity index 100% rename from usr.bin/mdocml/lib/libman/Makefile rename to external/bsd/mdocml/lib/libman/Makefile diff --git a/usr.bin/mdocml/lib/libmdoc/Makefile b/external/bsd/mdocml/lib/libmdoc/Makefile similarity index 100% rename from usr.bin/mdocml/lib/libmdoc/Makefile rename to 
external/bsd/mdocml/lib/libmdoc/Makefile diff --git a/usr.bin/mdocml/lib/libroff/Makefile b/external/bsd/mdocml/lib/libroff/Makefile similarity index 100% rename from usr.bin/mdocml/lib/libroff/Makefile rename to external/bsd/mdocml/lib/libroff/Makefile diff --git a/usr.bin/mdocml/man/Makefile b/external/bsd/mdocml/man/Makefile similarity index 100% rename from usr.bin/mdocml/man/Makefile rename to external/bsd/mdocml/man/Makefile diff --git a/usr.bin/mdocml/prepare-import.sh b/external/bsd/mdocml/prepare-import.sh similarity index 100% rename from usr.bin/mdocml/prepare-import.sh rename to external/bsd/mdocml/prepare-import.sh diff --git a/include/Makefile b/include/Makefile index e95fdcda4..5a120d103 100644 --- a/include/Makefile +++ b/include/Makefile @@ -25,19 +25,6 @@ INCS= a.out.h aio.h ar.h assert.h atomic.h \ ttyent.h tzfile.h ucontext.h ulimit.h unistd.h util.h utime.h utmp.h \ utmpx.h uuid.h varargs.h vis.h wchar.h wctype.h wordexp.h -INCS += ufs/chfs/chfs.h ufs/chfs/chfs_args.h ufs/chfs/chfs_inode.h \ - ufs/chfs/chfs_pool.h ufs/chfs/debug.h ufs/chfs/ebh.h \ - ufs/chfs/ebh_media.h ufs/chfs/ebh_misc.h ufs/chfs/media.h \ - ufs/ext2fs/ext2fs.h ufs/ext2fs/ext2fs_dinode.h \ - ufs/ext2fs/ext2fs_dir.h ufs/ext2fs/ext2fs_extern.h \ - ufs/ffs/ffs_extern.h ufs/ffs/fs.h ufs/lfs/lfs.h \ - ufs/lfs/lfs_extern.h ufs/mfs/mfs_extern.h ufs/mfs/mfsnode.h \ - ufs/ufs/dinode.h ufs/ufs/dir.h ufs/ufs/dirhash.h \ - ufs/ufs/extattr.h ufs/ufs/inode.h ufs/ufs/quota.h \ - ufs/ufs/quota1.h ufs/ufs/quota2.h ufs/ufs/ufs_bswap.h \ - ufs/ufs/ufs_extern.h ufs/ufs/ufs_quota.h ufs/ufs/ufs_wapbl.h \ - ufs/ufs/ufsmount.h \ - .else INCS= a.out.h aio.h ar.h assert.h atomic.h \ bitstring.h bm.h cdbr.h cdbw.h complex.h cpio.h ctype.h \ diff --git a/include/arch/i386/include/Makefile b/include/arch/i386/include/Makefile index 93281aee8..8a1150c65 100644 --- a/include/arch/i386/include/Makefile +++ b/include/arch/i386/include/Makefile @@ -9,8 +9,12 @@ INCS= ansi.h asm.h bswap.h byte_swap.h cdefs.h \ int_mwgwtypes.h int_types.h limits.h \ math.h mcontext.h npx.h param.h profile.h \ setjmp.h signal.h stdarg.h types.h \ - vmparam.h wchar_limits.h + vmparam.h wchar_limits.h \ + archtypes.h bios.h cmos.h cpu.h diskparm.h fpu.h int86.h \ + interrupt.h memory.h multiboot.h partition.h \ + pci.h pci_amd.h pci_intel.h pci_sis.h pci_via.h \ + ports.h stackframe.h vm.h elf.h elf_machdep.h mutex.h \ + disklabel.h -.include "${MINIXSRCDIR}/common/include/arch/i386/Makefile.inc" .include diff --git a/common/include/arch/i386/Makefile.inc b/include/arch/i386/include/Makefile.inc similarity index 100% rename from common/include/arch/i386/Makefile.inc rename to include/arch/i386/include/Makefile.inc diff --git a/common/include/arch/i386/archtypes.h b/include/arch/i386/include/archtypes.h similarity index 100% rename from common/include/arch/i386/archtypes.h rename to include/arch/i386/include/archtypes.h diff --git a/common/include/arch/i386/bios.h b/include/arch/i386/include/bios.h similarity index 100% rename from common/include/arch/i386/bios.h rename to include/arch/i386/include/bios.h diff --git a/common/include/arch/i386/cmos.h b/include/arch/i386/include/cmos.h similarity index 100% rename from common/include/arch/i386/cmos.h rename to include/arch/i386/include/cmos.h diff --git a/common/include/arch/i386/cpu.h b/include/arch/i386/include/cpu.h similarity index 100% rename from common/include/arch/i386/cpu.h rename to include/arch/i386/include/cpu.h diff --git a/include/arch/i386/include/disklabel.h 
b/include/arch/i386/include/disklabel.h index bf567de6d..e7d5246bc 100644 --- a/include/arch/i386/include/disklabel.h +++ b/include/arch/i386/include/disklabel.h @@ -1,4 +1,4 @@ -/* $NetBSD: disklabel.h,v 1.15 2009/11/23 13:40:10 pooka Exp $ */ +/* $NetBSD: disklabel.h,v 1.16 2011/08/30 12:39:55 bouyer Exp $ */ /* * Copyright (c) 1994 Christopher G. Demetriou @@ -33,6 +33,7 @@ #ifndef _I386_DISKLABEL_H_ #define _I386_DISKLABEL_H_ +#define LABELUSESMBR 1 /* use MBR partitionning */ #define LABELSECTOR 1 /* sector containing label */ #define LABELOFFSET 0 /* offset of label in sector */ #define MAXPARTITIONS 16 /* number of partitions */ diff --git a/common/include/arch/i386/diskparm.h b/include/arch/i386/include/diskparm.h similarity index 100% rename from common/include/arch/i386/diskparm.h rename to include/arch/i386/include/diskparm.h diff --git a/common/include/arch/i386/elf.h b/include/arch/i386/include/elf.h similarity index 100% rename from common/include/arch/i386/elf.h rename to include/arch/i386/include/elf.h diff --git a/common/include/arch/i386/elf_machdep.h b/include/arch/i386/include/elf_machdep.h similarity index 100% rename from common/include/arch/i386/elf_machdep.h rename to include/arch/i386/include/elf_machdep.h diff --git a/common/include/arch/i386/fpu.h b/include/arch/i386/include/fpu.h similarity index 100% rename from common/include/arch/i386/fpu.h rename to include/arch/i386/include/fpu.h diff --git a/common/include/arch/i386/int86.h b/include/arch/i386/include/int86.h similarity index 100% rename from common/include/arch/i386/int86.h rename to include/arch/i386/include/int86.h diff --git a/common/include/arch/i386/interrupt.h b/include/arch/i386/include/interrupt.h similarity index 100% rename from common/include/arch/i386/interrupt.h rename to include/arch/i386/include/interrupt.h diff --git a/common/include/arch/i386/memory.h b/include/arch/i386/include/memory.h similarity index 100% rename from common/include/arch/i386/memory.h rename to include/arch/i386/include/memory.h diff --git a/common/include/arch/i386/multiboot.h b/include/arch/i386/include/multiboot.h similarity index 100% rename from common/include/arch/i386/multiboot.h rename to include/arch/i386/include/multiboot.h diff --git a/common/include/arch/i386/mutex.h b/include/arch/i386/include/mutex.h similarity index 100% rename from common/include/arch/i386/mutex.h rename to include/arch/i386/include/mutex.h diff --git a/common/include/arch/i386/partition.h b/include/arch/i386/include/partition.h similarity index 100% rename from common/include/arch/i386/partition.h rename to include/arch/i386/include/partition.h diff --git a/common/include/arch/i386/pci.h b/include/arch/i386/include/pci.h similarity index 100% rename from common/include/arch/i386/pci.h rename to include/arch/i386/include/pci.h diff --git a/common/include/arch/i386/pci_amd.h b/include/arch/i386/include/pci_amd.h similarity index 100% rename from common/include/arch/i386/pci_amd.h rename to include/arch/i386/include/pci_amd.h diff --git a/common/include/arch/i386/pci_intel.h b/include/arch/i386/include/pci_intel.h similarity index 100% rename from common/include/arch/i386/pci_intel.h rename to include/arch/i386/include/pci_intel.h diff --git a/common/include/arch/i386/pci_sis.h b/include/arch/i386/include/pci_sis.h similarity index 100% rename from common/include/arch/i386/pci_sis.h rename to include/arch/i386/include/pci_sis.h diff --git a/common/include/arch/i386/pci_via.h b/include/arch/i386/include/pci_via.h similarity index 100% 
rename from common/include/arch/i386/pci_via.h rename to include/arch/i386/include/pci_via.h diff --git a/common/include/arch/i386/ports.h b/include/arch/i386/include/ports.h similarity index 100% rename from common/include/arch/i386/ports.h rename to include/arch/i386/include/ports.h diff --git a/common/include/arch/i386/stackframe.h b/include/arch/i386/include/stackframe.h similarity index 100% rename from common/include/arch/i386/stackframe.h rename to include/arch/i386/include/stackframe.h diff --git a/common/include/arch/i386/vm.h b/include/arch/i386/include/vm.h similarity index 100% rename from common/include/arch/i386/vm.h rename to include/arch/i386/include/vm.h diff --git a/sys/Makefile b/sys/Makefile index 3bee04ec7..1861b4903 100644 --- a/sys/Makefile +++ b/sys/Makefile @@ -6,5 +6,6 @@ SUBDIR= arch/i386/stand/mbr SUBDIR+= arch/i386/stand/bootxx SUBDIR+= arch/i386/stand/boot SUBDIR+= arch/i386/stand/cdboot +SUBDIR+= ufs .include diff --git a/sys/ufs/Makefile b/sys/ufs/Makefile new file mode 100644 index 000000000..c06bbf848 --- /dev/null +++ b/sys/ufs/Makefile @@ -0,0 +1,7 @@ +# $NetBSD: Makefile,v 1.2 2002/11/26 23:30:35 lukem Exp $ + +SUBDIR= ffs lfs mfs ufs ext2fs + +INCSDIR= /usr/include/ufs + +.include diff --git a/include/ufs/chfs/chfs.h b/sys/ufs/chfs/chfs.h similarity index 100% rename from include/ufs/chfs/chfs.h rename to sys/ufs/chfs/chfs.h diff --git a/include/ufs/chfs/chfs_args.h b/sys/ufs/chfs/chfs_args.h similarity index 100% rename from include/ufs/chfs/chfs_args.h rename to sys/ufs/chfs/chfs_args.h diff --git a/sys/ufs/chfs/chfs_build.c b/sys/ufs/chfs/chfs_build.c new file mode 100644 index 000000000..3904b023a --- /dev/null +++ b/sys/ufs/chfs/chfs_build.c @@ -0,0 +1,405 @@ +/* $NetBSD: chfs_build.c,v 1.2 2011/11/24 21:22:39 agc Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "chfs.h" +//#include + + +void +chfs_calc_trigger_levels(struct chfs_mount *chmp) +{ + uint32_t size; + + chmp->chm_resv_blocks_deletion = 2; + + size = chmp->chm_ebh->flash_size / 50; //2% of flash size + size += chmp->chm_ebh->peb_nr * 100; + size += chmp->chm_ebh->eb_size - 1; + + chmp->chm_resv_blocks_write = + chmp->chm_resv_blocks_deletion + (size / chmp->chm_ebh->eb_size); + chmp->chm_resv_blocks_gctrigger = chmp->chm_resv_blocks_write + 1; + chmp->chm_resv_blocks_gcmerge = chmp->chm_resv_blocks_deletion + 1; + chmp->chm_vdirty_blocks_gctrigger = chmp->chm_resv_blocks_gctrigger * 10; + + chmp->chm_nospc_dirty = + chmp->chm_ebh->eb_size + (chmp->chm_ebh->flash_size / 100); +} + + +/** + * chfs_build_set_vnodecache_nlink - set pvno and nlink in vnodecaches + * @chmp: CHFS main descriptor structure + * @vc: vnode cache + * This function travels @vc's directory entries and sets the pvno and nlink + * attribute of the vnode where the dirent's vno points. + */ +void +chfs_build_set_vnodecache_nlink(struct chfs_mount *chmp, + struct chfs_vnode_cache *vc) +{ + struct chfs_dirent *fd; + //dbg("set nlink\n"); + +// for (fd = vc->scan_dirents; fd; fd = fd->next) { + TAILQ_FOREACH(fd, &vc->scan_dirents, fds) { + struct chfs_vnode_cache *child_vc; + + if (!fd->vno) + continue; + + mutex_enter(&chmp->chm_lock_vnocache); + child_vc = chfs_vnode_cache_get(chmp, fd->vno); + mutex_exit(&chmp->chm_lock_vnocache); + if (!child_vc) { + chfs_mark_node_obsolete(chmp, fd->nref); + continue; + } + if (fd->type == VDIR) { + if (child_vc->nlink < 1) + child_vc->nlink = 1; + + if (child_vc->pvno) { + chfs_err("found a hard link: child dir: %s" + ", (vno: %llu) of dir vno: %llu\n", + fd->name, (unsigned long long)fd->vno, + (unsigned long long)vc->vno); + } else { + //dbg("child_vc->pvno = + // vc->vno; pvno = %d\n", child_vc->pvno); + child_vc->pvno = vc->vno; + } + } + child_vc->nlink++; + //dbg("child_vc->nlink++;\n"); + //child_vc->nlink++; + vc->nlink++; + } +} + +/** + * chfs_build_remove_unlinked vnode + */ +/* static */ +void +chfs_build_remove_unlinked_vnode(struct chfs_mount *chmp, + struct chfs_vnode_cache *vc, +// struct chfs_dirent **unlinked) + struct chfs_dirent_list *unlinked) +{ + struct chfs_node_ref *nref; + struct chfs_dirent *fd, *tmpfd; + + dbg("START\n"); + dbg("vno: %llu\n", (unsigned long long)vc->vno); + + nref = vc->dnode; + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + // The vnode cache is at the end of the data node's chain + while (nref != (struct chfs_node_ref *)vc) { + struct chfs_node_ref *next = nref->nref_next; + dbg("mark dnode\n"); + chfs_mark_node_obsolete(chmp, nref); + nref = next; + } + nref = vc->dirents; + // The vnode cache is at the end of the dirent node's chain + while (nref != (struct chfs_node_ref *)vc) { + struct chfs_node_ref *next = nref->nref_next; + dbg("mark dirent\n"); + chfs_mark_node_obsolete(chmp, nref); + nref = next; + } + if (!TAILQ_EMPTY(&vc->scan_dirents)) { + TAILQ_FOREACH_SAFE(fd, &vc->scan_dirents, fds, tmpfd) { +// while (vc->scan_dirents) { + struct chfs_vnode_cache *child_vc; +// fd = vc->scan_dirents; + dbg("dirent dump:\n"); + dbg(" ->vno: %llu\n", (unsigned long long)fd->vno); + dbg(" ->version: %llu\n", (unsigned long long)fd->version); + dbg(" ->nhash: 0x%x\n", fd->nhash); + dbg(" ->nsize: %d\n", fd->nsize); + dbg(" ->name: %s\n", fd->name); + dbg(" ->type: %d\n", fd->type); +// vc->scan_dirents = fd->next; + TAILQ_REMOVE(&vc->scan_dirents, fd, fds); + + if (!fd->vno) { + chfs_free_dirent(fd); + continue; + } + 
mutex_enter(&chmp->chm_lock_vnocache); + child_vc = chfs_vnode_cache_get(chmp, fd->vno); + mutex_exit(&chmp->chm_lock_vnocache); + if (!child_vc) { + chfs_free_dirent(fd); + continue; + } + /** + * Decrease nlink in child. If it is 0, add to unlinked + * dirents or just free it otherwise. + */ + child_vc->nlink--; + + if (!child_vc->nlink) { + //dbg("nlink is 0\n"); +// fd->next = *unlinked; +// *unlinked = fd; + // XXX HEAD or TAIL? + // original code did HEAD, but we could add + // it to the TAIL easily with TAILQ. + TAILQ_INSERT_TAIL(unlinked, fd, fds); + } else { + chfs_free_dirent(fd); + } + } + } else { + dbg("there are no scan dirents\n"); + } + + nref = vc->v; + while ((struct chfs_vnode_cache *)nref != vc) { + if (!CHFS_REF_OBSOLETE(nref)) + chfs_mark_node_obsolete(chmp, nref); + nref = nref->nref_next; + } + + mutex_enter(&chmp->chm_lock_vnocache); + if (vc->vno != CHFS_ROOTINO) + chfs_vnode_cache_set_state(chmp, vc, VNO_STATE_UNCHECKED); + mutex_exit(&chmp->chm_lock_vnocache); + dbg("END\n"); +} + +/** + * chfs_build_filesystem - build in-memory representation of filesystem + * @chmp: super block information + * + * Step 1: + * This function scans through the eraseblocks mapped in EBH. + * During scan builds up the map of vnodes and directory entries and puts them + * into the vnode_cache. + * Step 2: + * Scans the directory tree and set the nlink in the vnode caches. + * Step 3: + * Scans vnode caches with nlink = 0 + */ +int +chfs_build_filesystem(struct chfs_mount *chmp) +{ + int i,err = 0; + struct chfs_vnode_cache *vc; + struct chfs_dirent *fd, *tmpfd; +// struct chfs_dirent *unlinked = NULL; + struct chfs_node_ref **nref; + struct chfs_dirent_list unlinked; + struct chfs_vnode_cache *notregvc; + + TAILQ_INIT(&unlinked); + + mutex_enter(&chmp->chm_lock_mountfields); + + /** + * Step 1 + */ + chmp->chm_flags |= CHFS_MP_FLAG_SCANNING; + for (i = 0; i < chmp->chm_ebh->peb_nr; i++) { + //dbg("processing block: %d\n", i); + chmp->chm_blocks[i].lnr = i; + chmp->chm_blocks[i].free_size = chmp->chm_ebh->eb_size; + //If the LEB is add to free list skip it. + if (chmp->chm_ebh->lmap[i] < 0) { + //dbg("block %d is unmapped\n", i); + TAILQ_INSERT_TAIL(&chmp->chm_free_queue, + &chmp->chm_blocks[i], queue); + chmp->chm_nr_free_blocks++; + continue; + } + + err = chfs_scan_eraseblock(chmp, &chmp->chm_blocks[i]); + switch (err) { + case CHFS_BLK_STATE_FREE: + chmp->chm_nr_free_blocks++; + TAILQ_INSERT_TAIL(&chmp->chm_free_queue, + &chmp->chm_blocks[i], queue); + break; + case CHFS_BLK_STATE_CLEAN: + TAILQ_INSERT_TAIL(&chmp->chm_clean_queue, + &chmp->chm_blocks[i], queue); + break; + case CHFS_BLK_STATE_PARTDIRTY: + //dbg("free size: %d\n", chmp->chm_blocks[i].free_size); + if (chmp->chm_blocks[i].free_size > chmp->chm_wbuf_pagesize && + (!chmp->chm_nextblock || + chmp->chm_blocks[i].free_size > + chmp->chm_nextblock->free_size)) { + /* convert the old nextblock's free size to + * dirty and put it on a list */ + if (chmp->chm_nextblock) { + err = chfs_close_eraseblock(chmp, + chmp->chm_nextblock); + if (err) + return err; + } + chmp->chm_nextblock = &chmp->chm_blocks[i]; + } else { + /* convert the scanned block's free size to + * dirty and put it on a list */ + err = chfs_close_eraseblock(chmp, + &chmp->chm_blocks[i]); + if (err) + return err; + } + break; + case CHFS_BLK_STATE_ALLDIRTY: + /* + * The block has a valid EBH header, but it doesn't + * contain any valid data. 
+ */ + TAILQ_INSERT_TAIL(&chmp->chm_erase_pending_queue, + &chmp->chm_blocks[i], queue); + chmp->chm_nr_erasable_blocks++; + break; + default: + /* It was an error, unknown state */ + break; + } + + } + chmp->chm_flags &= ~CHFS_MP_FLAG_SCANNING; + + + //TODO need bad block check (and bad block handling in EBH too!!) + /* Now EBH only checks block is bad during its scan operation. + * Need check at erase + write + read... + */ + + /** + * Step 2 + */ + chmp->chm_flags |= CHFS_MP_FLAG_BUILDING; + for (i = 0; i < VNODECACHE_SIZE; i++) { + vc = chmp->chm_vnocache_hash[i]; + while (vc) { + dbg("vc->vno: %llu\n", (unsigned long long)vc->vno); + if (!TAILQ_EMPTY(&vc->scan_dirents)) + chfs_build_set_vnodecache_nlink(chmp, vc); + vc = vc->next; + } + } + + /** + * Step 3 + * Scan for vnodes with 0 nlink. + */ + for (i = 0; i < VNODECACHE_SIZE; i++) { + vc = chmp->chm_vnocache_hash[i]; + while (vc) { + if (vc->nlink) { + vc = vc->next; + continue; + } + + //dbg("remove unlinked start i: %d\n", i); + chfs_build_remove_unlinked_vnode(chmp, + vc, &unlinked); + //dbg("remove unlinked end\n"); + vc = vc->next; + } + } + /* Remove the newly unlinked vnodes. They are on the unlinked list */ + TAILQ_FOREACH_SAFE(fd, &unlinked, fds, tmpfd) { +// while (unlinked) { +// fd = unlinked; +// unlinked = fd->next; + TAILQ_REMOVE(&unlinked, fd, fds); + mutex_enter(&chmp->chm_lock_vnocache); + vc = chfs_vnode_cache_get(chmp, fd->vno); + mutex_exit(&chmp->chm_lock_vnocache); + if (vc) { + chfs_build_remove_unlinked_vnode(chmp, + vc, &unlinked); + } + chfs_free_dirent(fd); + } + + chmp->chm_flags &= ~CHFS_MP_FLAG_BUILDING; + + /* Free all dirents */ + for (i = 0; i < VNODECACHE_SIZE; i++) { + vc = chmp->chm_vnocache_hash[i]; + while (vc) { + TAILQ_FOREACH_SAFE(fd, &vc->scan_dirents, fds, tmpfd) { +// while (vc->scan_dirents) { +// fd = vc->scan_dirents; +// vc->scan_dirents = fd->next; + TAILQ_REMOVE(&vc->scan_dirents, fd, fds); + if (fd->vno == 0) { + //for (nref = &vc->dirents; + // *nref != fd->nref; + // nref = &((*nref)->next)); + + nref = &fd->nref; + *nref = fd->nref->nref_next; + //fd->nref->nref_next = NULL; + } else if (fd->type == VDIR) { + //set state every non-VREG file's vc + mutex_enter(&chmp->chm_lock_vnocache); + notregvc = + chfs_vnode_cache_get(chmp, + fd->vno); + chfs_vnode_cache_set_state(chmp, + notregvc, VNO_STATE_PRESENT); + mutex_exit(&chmp->chm_lock_vnocache); + } + chfs_free_dirent(fd); + } +// vc->scan_dirents = NULL; + KASSERT(TAILQ_EMPTY(&vc->scan_dirents)); + vc = vc->next; + } + } + + //Set up chmp->chm_wbuf_ofs for the first write + if (chmp->chm_nextblock) { + dbg("free_size: %d\n", chmp->chm_nextblock->free_size); + chmp->chm_wbuf_ofs = chmp->chm_ebh->eb_size - + chmp->chm_nextblock->free_size; + } else { + chmp->chm_wbuf_ofs = 0xffffffff; + } + mutex_exit(&chmp->chm_lock_mountfields); + + return 0; +} + diff --git a/sys/ufs/chfs/chfs_erase.c b/sys/ufs/chfs/chfs_erase.c new file mode 100644 index 000000000..9ae49c37c --- /dev/null +++ b/sys/ufs/chfs/chfs_erase.c @@ -0,0 +1,137 @@ +/* $NetBSD: chfs_erase.c,v 1.1 2011/11/24 15:51:31 ahoka Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * Copyright (c) 2010 David Tengeri + * All rights reserved. 
+ * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * chfs_erase.c + * + * Copyright (C) 2010 David Tengeri , + * ... + * University of Szeged, Hungary + */ + +#include "chfs.h" + + +/** + * chfs_remap_leb - unmap and then map a leb + * @chmp: chfs mount structure + * + * This function gets an eraseblock from the erasable queue, unmaps it through + * EBH and maps another eraseblock to the same LNR. + * EBH will find a free eraseblock if any or will erase one if there isn't any + * free, just dirty block. + * + * Returns zero on case of success, errorcode otherwise. + * + * Needs more brainstorming here. + */ +int +chfs_remap_leb(struct chfs_mount *chmp) +{ + int err; + struct chfs_eraseblock *cheb; + dbg("chfs_remap_leb\n"); + uint32_t dirty, unchecked, used, free, wasted; + + //dbg("chmp->chm_nr_erasable_blocks: %d\n", chmp->chm_nr_erasable_blocks); + //dbg("ltree: %p ecl: %p\n", &chmp->chm_ebh->ltree_lock, &chmp->chm_lock_sizes); + KASSERT(!rw_write_held(&chmp->chm_lock_wbuf)); + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + KASSERT(mutex_owned(&chmp->chm_lock_sizes)); + + if (!chmp->chm_nr_erasable_blocks) { + //TODO + /* We don't have any erasable blocks, need to check if there are + * blocks on erasable_pending_wbuf_queue, flush the data and then + * we can remap it. + * If there aren't any blocks on that list too, we need to GC? + */ + if (!TAILQ_EMPTY(&chmp->chm_erasable_pending_wbuf_queue)) { + cheb = TAILQ_FIRST(&chmp->chm_erasable_pending_wbuf_queue); + TAILQ_REMOVE(&chmp->chm_erasable_pending_wbuf_queue, cheb, queue); + if (chmp->chm_wbuf_len) { + mutex_exit(&chmp->chm_lock_sizes); + chfs_flush_pending_wbuf(chmp); + mutex_enter(&chmp->chm_lock_sizes); + } + TAILQ_INSERT_TAIL(&chmp->chm_erase_pending_queue, cheb, queue); + chmp->chm_nr_erasable_blocks++; + } else { + /* We can't delete any block. */ + //FIXME should we return ENOSPC? 
+ return ENOSPC; + } + } + cheb = TAILQ_FIRST(&chmp->chm_erase_pending_queue); + TAILQ_REMOVE(&chmp->chm_erase_pending_queue, cheb, queue); + chmp->chm_nr_erasable_blocks--; + + dirty = cheb->dirty_size; + unchecked = cheb->unchecked_size; + used = cheb->used_size; + free = cheb->free_size; + wasted = cheb->wasted_size; + + // Free allocated node references for this eraseblock + chfs_free_node_refs(cheb); + + err = chfs_unmap_leb(chmp, cheb->lnr); + if (err) + return err; + + err = chfs_map_leb(chmp, cheb->lnr); + if (err) + return err; + // Reset state to default and change chmp sizes too + chfs_change_size_dirty(chmp, cheb, -dirty); + chfs_change_size_unchecked(chmp, cheb, -unchecked); + chfs_change_size_used(chmp, cheb, -used); + chfs_change_size_free(chmp, cheb, chmp->chm_ebh->eb_size - free); + chfs_change_size_wasted(chmp, cheb, -wasted); + + KASSERT(cheb->dirty_size == 0); + KASSERT(cheb->unchecked_size == 0); + KASSERT(cheb->used_size == 0); + KASSERT(cheb->free_size == chmp->chm_ebh->eb_size); + KASSERT(cheb->wasted_size == 0); + + cheb->first_node = NULL; + cheb->last_node = NULL; + //put it to free_queue + TAILQ_INSERT_TAIL(&chmp->chm_free_queue, cheb, queue); + chmp->chm_nr_free_blocks++; + dbg("remaped (free: %d, erasable: %d)\n", chmp->chm_nr_free_blocks, chmp->chm_nr_erasable_blocks); + KASSERT(!TAILQ_EMPTY(&chmp->chm_free_queue)); + + return 0; +} diff --git a/sys/ufs/chfs/chfs_gc.c b/sys/ufs/chfs/chfs_gc.c new file mode 100644 index 000000000..aa32d64b9 --- /dev/null +++ b/sys/ufs/chfs/chfs_gc.c @@ -0,0 +1,1238 @@ +/* $NetBSD: chfs_gc.c,v 1.2 2011/11/24 21:09:37 agc Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * Copyright (c) 2010 Tamas Toth + * Copyright (c) 2010 Adam Hoka + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "chfs.h" + +void chfs_gc_release_inode(struct chfs_mount *, + struct chfs_inode *); +struct chfs_inode *chfs_gc_fetch_inode(struct chfs_mount *, + ino_t, uint32_t); +int chfs_check(struct chfs_mount *, struct chfs_vnode_cache *); +void chfs_clear_inode(struct chfs_mount *, struct chfs_inode *); + + +struct chfs_eraseblock *find_gc_block(struct chfs_mount *); +int chfs_gcollect_pristine(struct chfs_mount *, + struct chfs_eraseblock *, + struct chfs_vnode_cache *, struct chfs_node_ref *); +int chfs_gcollect_live(struct chfs_mount *, + struct chfs_eraseblock *, struct chfs_node_ref *, + struct chfs_inode *); +int chfs_gcollect_vnode(struct chfs_mount *, struct chfs_inode *); +int chfs_gcollect_dirent(struct chfs_mount *, + struct chfs_eraseblock *, struct chfs_inode *, + struct chfs_dirent *); +int chfs_gcollect_deletion_dirent(struct chfs_mount *, + struct chfs_eraseblock *, struct chfs_inode *, + struct chfs_dirent *); +int chfs_gcollect_dnode(struct chfs_mount *, + struct chfs_eraseblock *, struct chfs_inode *, + struct chfs_full_dnode *, uint32_t, uint32_t); + +/* must be called with chm_lock_mountfields held */ +void +chfs_gc_trigger(struct chfs_mount *chmp) +{ + struct garbage_collector_thread *gc = &chmp->chm_gc_thread; + + //mutex_enter(&chmp->chm_lock_sizes); + if (gc->gcth_running && + chfs_gc_thread_should_wake(chmp)) { + cv_signal(&gc->gcth_wakeup); + } + //mutex_exit(&chmp->chm_lock_sizes); +} + + +void +chfs_gc_thread(void *data) +{ + struct chfs_mount *chmp = data; + struct garbage_collector_thread *gc = &chmp->chm_gc_thread; + + dbg_gc("[GC THREAD] thread started\n"); + + mutex_enter(&chmp->chm_lock_mountfields); + while (gc->gcth_running) { + /* we must call chfs_gc_thread_should_wake with chm_lock_mountfields + * held, which is a bit awkwardly done here, but we cant relly + * do it otherway with the current design... + */ + if (chfs_gc_thread_should_wake(chmp)) { +// mutex_exit(&chmp->chm_lock_mountfields); + if (chfs_gcollect_pass(chmp) == ENOSPC) { + dbg_gc("No space for garbage collection\n"); + panic("No space for garbage collection\n"); + /* XXX why break here? i have added a panic + * here to see if it gets triggered -ahoka + */ + break; + } + /* XXX gcollect_pass drops the mutex */ + mutex_enter(&chmp->chm_lock_mountfields); + } + + cv_timedwait_sig(&gc->gcth_wakeup, + &chmp->chm_lock_mountfields, mstohz(100)); + } + mutex_exit(&chmp->chm_lock_mountfields); + + dbg_gc("[GC THREAD] thread stopped\n"); + kthread_exit(0); +} + +void +chfs_gc_thread_start(struct chfs_mount *chmp) +{ + struct garbage_collector_thread *gc = &chmp->chm_gc_thread; + + cv_init(&gc->gcth_wakeup, "chfsgccv"); + + gc->gcth_running = true; + kthread_create(PRI_NONE, /*KTHREAD_MPSAFE |*/ KTHREAD_MUSTJOIN, + NULL, chfs_gc_thread, chmp, &gc->gcth_thread, + "chfsgcth"); +} + +void +chfs_gc_thread_stop(struct chfs_mount *chmp) +{ + struct garbage_collector_thread *gc = &chmp->chm_gc_thread; + + /* check if it is actually running. 
if not, do nothing */ + if (gc->gcth_running) { + gc->gcth_running = false; + } else { + return; + } + cv_signal(&gc->gcth_wakeup); + dbg_gc("[GC THREAD] stop signal sent\n"); + + kthread_join(gc->gcth_thread); +#ifdef BROKEN_KTH_JOIN + kpause("chfsthjoin", false, mstohz(1000), NULL); +#endif + + cv_destroy(&gc->gcth_wakeup); +} + +/* must be called with chm_lock_mountfields held */ +int +chfs_gc_thread_should_wake(struct chfs_mount *chmp) +{ + int nr_very_dirty = 0; + struct chfs_eraseblock *cheb; + uint32_t dirty; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + if (!TAILQ_EMPTY(&chmp->chm_erase_pending_queue)) { + dbg_gc("erase_pending\n"); + return 1; + } + + if (chmp->chm_unchecked_size) { + dbg_gc("unchecked\n"); + return 1; + } + + dirty = chmp->chm_dirty_size - chmp->chm_nr_erasable_blocks * + chmp->chm_ebh->eb_size; + + if (chmp->chm_nr_free_blocks + chmp->chm_nr_erasable_blocks < + chmp->chm_resv_blocks_gctrigger && (dirty > chmp->chm_nospc_dirty)) { + dbg_gc("free: %d + erasable: %d < resv: %d\n", + chmp->chm_nr_free_blocks, chmp->chm_nr_erasable_blocks, + chmp->chm_resv_blocks_gctrigger); + dbg_gc("dirty: %d > nospc_dirty: %d\n", + dirty, chmp->chm_nospc_dirty); + + return 1; + } + + TAILQ_FOREACH(cheb, &chmp->chm_very_dirty_queue, queue) { + nr_very_dirty++; + if (nr_very_dirty == chmp->chm_vdirty_blocks_gctrigger) { + dbg_gc("nr_very_dirty\n"); + return 1; + } + } + + return 0; +} + +void +chfs_gc_release_inode(struct chfs_mount *chmp, + struct chfs_inode *ip) +{ + dbg_gc("release inode\n"); + //mutex_exit(&ip->inode_lock); + //vput(ITOV(ip)); +} + +struct chfs_inode * +chfs_gc_fetch_inode(struct chfs_mount *chmp, ino_t vno, + uint32_t unlinked) +{ + struct vnode *vp = NULL; + struct chfs_vnode_cache *vc; + struct chfs_inode *ip; + dbg_gc("fetch inode %llu\n", (unsigned long long)vno); + + if (unlinked) { + dbg_gc("unlinked\n"); + vp = chfs_vnode_lookup(chmp, vno); + if (!vp) { + mutex_enter(&chmp->chm_lock_vnocache); + vc = chfs_vnode_cache_get(chmp, vno); + if (!vc) { + mutex_exit(&chmp->chm_lock_vnocache); + return NULL; + } + if (vc->state != VNO_STATE_CHECKEDABSENT) { + //sleep_on_spinunlock(&chmp->chm_lock_vnocache); + mutex_exit(&chmp->chm_lock_vnocache); + /* XXX why do we need the delay here?! 
*/ +// kpause("chvncabs", true, mstohz(50), NULL); + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + cv_timedwait_sig( + &chmp->chm_gc_thread.gcth_wakeup, + &chmp->chm_lock_mountfields, mstohz(50)); + +// KASSERT(!mutex_owned(&chmp->chm_lock_vnocache)); + } else { + mutex_exit(&chmp->chm_lock_vnocache); + } + return NULL; + } + } else { + dbg_gc("vnode lookup\n"); + vp = chfs_vnode_lookup(chmp, vno); + //VFS_VGET(chmp->chm_fsmp, vno, &vp); + } + dbg_gc("vp to ip\n"); + ip = VTOI(vp); + KASSERT(ip); + //mutex_enter(&ip->inode_lock); + + return ip; +} + +extern rb_tree_ops_t frag_rbtree_ops; + +int +chfs_check(struct chfs_mount *chmp, struct chfs_vnode_cache *chvc) +{ + struct chfs_inode *ip; + struct vnode *vp; + int ret; + + ip = pool_get(&chfs_inode_pool, PR_WAITOK); + if (!ip) { + return ENOMEM; + } + + vp = kmem_zalloc(sizeof(struct vnode), KM_SLEEP); + + ip->chvc = chvc; + ip->vp = vp; + + vp->v_data = ip; + + rb_tree_init(&ip->fragtree, &frag_rbtree_ops); + TAILQ_INIT(&ip->dents); + + ret = chfs_read_inode_internal(chmp, ip); + if (!ret) { + chfs_clear_inode(chmp, ip); + } + + pool_put(&chfs_inode_pool, ip); + + return ret; +} + +void +chfs_clear_inode(struct chfs_mount *chmp, struct chfs_inode *ip) +{ + struct chfs_dirent *fd, *tmpfd; + struct chfs_vnode_cache *chvc; + + + /* XXX not sure if this is the correct locking */ +// mutex_enter(&chmp->chm_lock_vnocache); + chvc = ip->chvc; + /* shouldnt this be: */ + //bool deleted = (chvc && !(chvc->pvno || chvc->nlink)); + int deleted = (chvc && !(chvc->pvno | chvc->nlink)); + + if (chvc && chvc->state != VNO_STATE_CHECKING) { +// chfs_vnode_cache_state_set(chmp, chvc, VNO_STATE_CLEARING); + chvc->state = VNO_STATE_CLEARING; + } + + if (chvc->v && ((struct chfs_vnode_cache *)chvc->v != chvc)) { + if (deleted) + chfs_mark_node_obsolete(chmp, chvc->v); + //chfs_free_refblock(chvc->v); + } +// mutex_enter(&chmp->chm_lock_vnocache); + + chfs_kill_fragtree(&ip->fragtree); +/* + fd = TAILQ_FIRST(&ip->dents); + while (fd) { + TAILQ_REMOVE(&ip->dents, fd, fds); + chfs_free_dirent(fd); + fd = TAILQ_FIRST(&ip->dents); + } +*/ + + TAILQ_FOREACH_SAFE(fd, &ip->dents, fds, tmpfd) { + chfs_free_dirent(fd); + } + + if (chvc && chvc->state == VNO_STATE_CHECKING) { + chfs_vnode_cache_set_state(chmp, + chvc, VNO_STATE_CHECKEDABSENT); + if ((struct chfs_vnode_cache *)chvc->v == chvc && + (struct chfs_vnode_cache *)chvc->dirents == chvc && + (struct chfs_vnode_cache *)chvc->dnode == chvc) + chfs_vnode_cache_remove(chmp, chvc); + } + +} + +struct chfs_eraseblock * +find_gc_block(struct chfs_mount *chmp) +{ + struct chfs_eraseblock *ret; + struct chfs_eraseblock_queue *nextqueue; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + struct timespec now; + vfs_timestamp(&now); + + int n = now.tv_nsec % 128; + + //dbg_gc("n = %d\n", n); +again: +/* if (!TAILQ_EMPTY(&chmp->chm_bad_used_queue) && chmp->chm_nr_free_blocks > chmp->chm_nr_resv_blocks_gcbad) { + dbg_gc("Picking block from bad_used_queue to GC next\n"); + nextqueue = &chmp->chm_bad_used_queue; + } else */if (n<50 && !TAILQ_EMPTY(&chmp->chm_erase_pending_queue)) { + dbg_gc("Picking block from erase_pending_queue to GC next\n"); + nextqueue = &chmp->chm_erase_pending_queue; + } else if (n<110 && !TAILQ_EMPTY(&chmp->chm_very_dirty_queue) ) { + dbg_gc("Picking block from very_dirty_queue to GC next\n"); + nextqueue = &chmp->chm_very_dirty_queue; + } else if (n<126 && !TAILQ_EMPTY(&chmp->chm_dirty_queue) ) { + dbg_gc("Picking block from dirty_queue to GC next\n"); + nextqueue = 
&chmp->chm_dirty_queue; + } else if (!TAILQ_EMPTY(&chmp->chm_clean_queue)) { + dbg_gc("Picking block from clean_queue to GC next\n"); + nextqueue = &chmp->chm_clean_queue; + } else if (!TAILQ_EMPTY(&chmp->chm_dirty_queue)) { + dbg_gc("Picking block from dirty_queue to GC next" + " (clean_queue was empty)\n"); + nextqueue = &chmp->chm_dirty_queue; + } else if (!TAILQ_EMPTY(&chmp->chm_very_dirty_queue)) { + dbg_gc("Picking block from very_dirty_queue to GC next" + " (clean_queue and dirty_queue were empty)\n"); + nextqueue = &chmp->chm_very_dirty_queue; + } else if (!TAILQ_EMPTY(&chmp->chm_erase_pending_queue)) { + dbg_gc("Picking block from erase_pending_queue to GC next" + " (clean_queue and {very_,}dirty_queue were empty)\n"); + nextqueue = &chmp->chm_erase_pending_queue; + } else if (!TAILQ_EMPTY(&chmp->chm_erasable_pending_wbuf_queue)) { + dbg_gc("Syncing wbuf in order to reuse " + "erasable_pending_wbuf_queue blocks\n"); + rw_enter(&chmp->chm_lock_wbuf, RW_WRITER); + chfs_flush_pending_wbuf(chmp); + rw_exit(&chmp->chm_lock_wbuf); + goto again; + } else { + dbg_gc("CHFS: no clean, dirty _or_ erasable" + " blocks to GC from! Where are they all?\n"); + return NULL; + } + + ret = TAILQ_FIRST(nextqueue); + if (chmp->chm_nextblock) { + dbg_gc("nextblock num: %u - gcblock num: %u\n", + chmp->chm_nextblock->lnr, ret->lnr); + if (ret == chmp->chm_nextblock) + goto again; + //KASSERT(ret != chmp->chm_nextblock); + //dbg_gc("first node lnr: %u ofs: %u\n", ret->first_node->lnr, ret->first_node->offset); + //dbg_gc("last node lnr: %u ofs: %u\n", ret->last_node->lnr, ret->last_node->offset); + } + TAILQ_REMOVE(nextqueue, ret, queue); + chmp->chm_gcblock = ret; + ret->gc_node = ret->first_node; + + if (!ret->gc_node) { + dbg_gc("Oops! ret->gc_node at LEB: %u is NULL\n", ret->lnr); + panic("CHFS BUG - one LEB's gc_node is NULL\n"); + } + + /* TODO wasted size?
*/ + return ret; +} + + +int +chfs_gcollect_pass(struct chfs_mount *chmp) +{ + struct chfs_vnode_cache *vc; + struct chfs_eraseblock *eb; + struct chfs_node_ref *nref; + uint32_t gcblock_dirty; + struct chfs_inode *ip; + ino_t vno, pvno; + uint32_t nlink; + int ret = 0; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + +// mutex_enter(&chmp->chm_lock_mountfields); + for (;;) { + mutex_enter(&chmp->chm_lock_sizes); + + dbg_gc("unchecked size == %u\n", chmp->chm_unchecked_size); + if (!chmp->chm_unchecked_size) + break; + + if (chmp->chm_checked_vno > chmp->chm_max_vno) { + mutex_exit(&chmp->chm_lock_sizes); + mutex_exit(&chmp->chm_lock_mountfields); + dbg_gc("checked_vno (#%llu) > max_vno (#%llu)\n", + (unsigned long long)chmp->chm_checked_vno, + (unsigned long long)chmp->chm_max_vno); + return ENOSPC; + } + + mutex_exit(&chmp->chm_lock_sizes); + + mutex_enter(&chmp->chm_lock_vnocache); + dbg_gc("checking vno #%llu\n", + (unsigned long long)chmp->chm_checked_vno); + dbg_gc("get vnode cache\n"); + vc = chfs_vnode_cache_get(chmp, chmp->chm_checked_vno++); + + if (!vc) { + dbg_gc("!vc\n"); + mutex_exit(&chmp->chm_lock_vnocache); + continue; + } + + if ((vc->pvno | vc->nlink) == 0) { + dbg_gc("(pvno | nlink) == 0\n"); + mutex_exit(&chmp->chm_lock_vnocache); + continue; + } + + dbg_gc("switch\n"); + switch (vc->state) { + case VNO_STATE_CHECKEDABSENT: + case VNO_STATE_PRESENT: + mutex_exit(&chmp->chm_lock_vnocache); + continue; + + case VNO_STATE_GC: + case VNO_STATE_CHECKING: + mutex_exit(&chmp->chm_lock_vnocache); + mutex_exit(&chmp->chm_lock_mountfields); + dbg_gc("VNO_STATE GC or CHECKING\n"); + panic("CHFS BUG - vc state gc or checking\n"); + + case VNO_STATE_READING: + chmp->chm_checked_vno--; + mutex_exit(&chmp->chm_lock_vnocache); + /* XXX why do we need the delay here?! 
*/ + kpause("chvncrea", true, mstohz(50), NULL); + +// sleep_on_spinunlock(&chmp->chm_lock_vnocache); +// KASSERT(!mutex_owned(&chmp->chm_lock_vnocache)); + mutex_exit(&chmp->chm_lock_mountfields); + return 0; + + default: + mutex_exit(&chmp->chm_lock_vnocache); + mutex_exit(&chmp->chm_lock_mountfields); + dbg_gc("default\n"); + panic("CHFS BUG - vc state is other what we" + " checked\n"); + + case VNO_STATE_UNCHECKED: + ; + } + + chfs_vnode_cache_set_state(chmp, vc, VNO_STATE_CHECKING); + + /* XXX check if this is too heavy to call under + * chm_lock_vnocache + */ + ret = chfs_check(chmp, vc); + dbg_gc("set state\n"); + chfs_vnode_cache_set_state(chmp, + vc, VNO_STATE_CHECKEDABSENT); + + mutex_exit(&chmp->chm_lock_vnocache); + mutex_exit(&chmp->chm_lock_mountfields); + + return ret; + } + + + eb = chmp->chm_gcblock; + + if (!eb) { + eb = find_gc_block(chmp); + } + + if (!eb) { + dbg_gc("!eb\n"); + if (!TAILQ_EMPTY(&chmp->chm_erase_pending_queue)) { + mutex_exit(&chmp->chm_lock_sizes); + mutex_exit(&chmp->chm_lock_mountfields); + return EAGAIN; + } + mutex_exit(&chmp->chm_lock_sizes); + mutex_exit(&chmp->chm_lock_mountfields); + return EIO; + } + + if (!eb->used_size) { + dbg_gc("!eb->used_size\n"); + goto eraseit; + } + + nref = eb->gc_node; + //dbg_gc("gc use: %u\n", chmp->chm_nextblock->lnr); + //dbg_gc("nref: %u %u\n", nref->nref_lnr, nref->nref_offset); + gcblock_dirty = eb->dirty_size; + + while(CHFS_REF_OBSOLETE(nref)) { + //dbg_gc("obsoleted nref lnr: %u - offset: %u\n", nref->nref_lnr, nref->nref_offset); +#ifdef DBG_MSG_GC + if (nref == chmp->chm_blocks[nref->nref_lnr].last_node) { + dbg_gc("THIS NODE IS THE LAST NODE OF ITS EB\n"); + } +#endif + nref = node_next(nref); + if (!nref) { + //dbg_gc("!nref\n"); + eb->gc_node = nref; + mutex_exit(&chmp->chm_lock_sizes); + mutex_exit(&chmp->chm_lock_mountfields); + panic("CHFS BUG - nref is NULL)\n"); + } + } + eb->gc_node = nref; + //dbg_gc("nref the chosen one lnr: %u - offset: %u\n", nref->nref_lnr, nref->nref_offset); + KASSERT(nref->nref_lnr == chmp->chm_gcblock->lnr); + + if (!nref->nref_next) { + //dbg_gc("!nref->nref_next\n"); + mutex_exit(&chmp->chm_lock_sizes); + if (CHFS_REF_FLAGS(nref) == CHFS_PRISTINE_NODE_MASK) { + chfs_gcollect_pristine(chmp, eb, NULL, nref); + } else { + chfs_mark_node_obsolete(chmp, nref); + } + goto lock_size; + } + + dbg_gc("nref lnr: %u - offset: %u\n", nref->nref_lnr, nref->nref_offset); + vc = chfs_nref_to_vc(nref); + + mutex_exit(&chmp->chm_lock_sizes); + + //dbg_gc("enter vnocache lock on #%llu\n", vc->vno); + mutex_enter(&chmp->chm_lock_vnocache); + + dbg_gc("switch\n"); + switch(vc->state) { + case VNO_STATE_CHECKEDABSENT: + if (CHFS_REF_FLAGS(nref) == CHFS_PRISTINE_NODE_MASK) { + chfs_vnode_cache_set_state(chmp, vc, VNO_STATE_GC); + } + break; + + case VNO_STATE_PRESENT: + break; + + case VNO_STATE_UNCHECKED: + case VNO_STATE_CHECKING: + case VNO_STATE_GC: + mutex_exit(&chmp->chm_lock_vnocache); + mutex_exit(&chmp->chm_lock_mountfields); + panic("CHFS BUG - vc state unchecked," + " checking or gc (vno #%llu, num #%d)\n", + (unsigned long long)vc->vno, vc->state); + + case VNO_STATE_READING: + mutex_exit(&chmp->chm_lock_vnocache); + /* XXX why do we need the delay here?! 
*/ + kpause("chvncrea", true, mstohz(50), NULL); + +// sleep_on_spinunlock(&chmp->chm_lock_vnocache); +// KASSERT(!mutex_owned(&chmp->chm_lock_vnocache)); + mutex_exit(&chmp->chm_lock_mountfields); + return 0; + } + + if (vc->state == VNO_STATE_GC) { + dbg_gc("vc->state == VNO_STATE_GC\n"); + mutex_exit(&chmp->chm_lock_vnocache); + ret = chfs_gcollect_pristine(chmp, eb, NULL, nref); + +// chfs_vnode_cache_state_set(chmp, +// vc, VNO_STATE_CHECKEDABSENT); + /* XXX locking? */ + vc->state = VNO_STATE_CHECKEDABSENT; + //TODO wake_up(&chmp->chm_vnocache_wq); + if (ret != EBADF) + goto test_gcnode; + mutex_enter(&chmp->chm_lock_vnocache); + } + + vno = vc->vno; + pvno = vc->pvno; + nlink = vc->nlink; + mutex_exit(&chmp->chm_lock_vnocache); + + ip = chfs_gc_fetch_inode(chmp, vno, !(pvno | nlink)); + + if (!ip) { + dbg_gc("!ip\n"); + ret = 0; + goto lock_size; + } + + chfs_gcollect_live(chmp, eb, nref, ip); + + chfs_gc_release_inode(chmp, ip); + +test_gcnode: + if (eb->dirty_size == gcblock_dirty && + !CHFS_REF_OBSOLETE(eb->gc_node)) { + dbg_gc("ERROR collecting node at %u failed.\n", + CHFS_GET_OFS(eb->gc_node->nref_offset)); + + ret = ENOSPC; + } + +lock_size: + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + mutex_enter(&chmp->chm_lock_sizes); +eraseit: + dbg_gc("eraseit\n"); + + if (chmp->chm_gcblock) { + dbg_gc("eb used size = %u\n", chmp->chm_gcblock->used_size); + dbg_gc("eb free size = %u\n", chmp->chm_gcblock->free_size); + dbg_gc("eb dirty size = %u\n", chmp->chm_gcblock->dirty_size); + dbg_gc("eb unchecked size = %u\n", + chmp->chm_gcblock->unchecked_size); + dbg_gc("eb wasted size = %u\n", chmp->chm_gcblock->wasted_size); + + KASSERT(chmp->chm_gcblock->used_size + chmp->chm_gcblock->free_size + + chmp->chm_gcblock->dirty_size + + chmp->chm_gcblock->unchecked_size + + chmp->chm_gcblock->wasted_size == chmp->chm_ebh->eb_size); + + } + + if (chmp->chm_gcblock && chmp->chm_gcblock->dirty_size + + chmp->chm_gcblock->wasted_size == chmp->chm_ebh->eb_size) { + dbg_gc("Block at leb #%u completely obsoleted by GC, " + "Moving to erase_pending_queue\n", chmp->chm_gcblock->lnr); + TAILQ_INSERT_TAIL(&chmp->chm_erase_pending_queue, + chmp->chm_gcblock, queue); + chmp->chm_gcblock = NULL; + chmp->chm_nr_erasable_blocks++; + if (!TAILQ_EMPTY(&chmp->chm_erase_pending_queue)) { + ret = chfs_remap_leb(chmp); + } + } + + mutex_exit(&chmp->chm_lock_sizes); + mutex_exit(&chmp->chm_lock_mountfields); + dbg_gc("return\n"); + return ret; +} + + +int +chfs_gcollect_pristine(struct chfs_mount *chmp, struct chfs_eraseblock *cheb, + struct chfs_vnode_cache *chvc, struct chfs_node_ref *nref) +{ + struct chfs_node_ref *newnref; + struct chfs_flash_node_hdr *nhdr; + struct chfs_flash_vnode *fvnode; + struct chfs_flash_dirent_node *fdirent; + struct chfs_flash_data_node *fdata; + int ret, retries = 0; + uint32_t ofs, crc; + size_t totlen = chfs_nref_len(chmp, cheb, nref); + char *data; + struct iovec vec; + size_t retlen; + + dbg_gc("gcollect_pristine\n"); + + data = kmem_alloc(totlen, KM_SLEEP); + if (!data) + return ENOMEM; + + ofs = CHFS_GET_OFS(nref->nref_offset); + + ret = chfs_read_leb(chmp, nref->nref_lnr, data, ofs, totlen, &retlen); + if (ret) { + dbg_gc("reading error\n"); + return ret; + } + if (retlen != totlen) { + dbg_gc("read size error\n"); + return EIO; + } + nhdr = (struct chfs_flash_node_hdr *)data; + /* check the header */ + if (le16toh(nhdr->magic) != CHFS_FS_MAGIC_BITMASK) { + dbg_gc("node header magic number error\n"); + return EBADF; + } + crc = crc32(0, (uint8_t *)nhdr, 
CHFS_NODE_HDR_SIZE - 4); + if (crc != le32toh(nhdr->hdr_crc)) { + dbg_gc("node header crc error\n"); + return EBADF; + } + + switch(le16toh(nhdr->type)) { + case CHFS_NODETYPE_VNODE: + fvnode = (struct chfs_flash_vnode *)data; + crc = crc32(0, (uint8_t *)fvnode, sizeof(struct chfs_flash_vnode) - 4); + if (crc != le32toh(fvnode->node_crc)) { + dbg_gc("vnode crc error\n"); + return EBADF; + } + break; + case CHFS_NODETYPE_DIRENT: + fdirent = (struct chfs_flash_dirent_node *)data; + crc = crc32(0, (uint8_t *)fdirent, sizeof(struct chfs_flash_dirent_node) - 4); + if (crc != le32toh(fdirent->node_crc)) { + dbg_gc("dirent crc error\n"); + return EBADF; + } + crc = crc32(0, fdirent->name, fdirent->nsize); + if (crc != le32toh(fdirent->name_crc)) { + dbg_gc("dirent name crc error\n"); + return EBADF; + } + break; + case CHFS_NODETYPE_DATA: + fdata = (struct chfs_flash_data_node *)data; + crc = crc32(0, (uint8_t *)fdata, sizeof(struct chfs_flash_data_node) - 4); + if (crc != le32toh(fdata->node_crc)) { + dbg_gc("data node crc error\n"); + return EBADF; + } + break; + default: + if (chvc) { + dbg_gc("unknown node have vnode cache\n"); + return EBADF; + } + } + /* CRC's OK, write node to its new place */ +retry: + ret = chfs_reserve_space_gc(chmp, totlen); + if (ret) + return ret; + + newnref = chfs_alloc_node_ref(chmp->chm_nextblock); + if (!newnref) + return ENOMEM; + + ofs = chmp->chm_ebh->eb_size - chmp->chm_nextblock->free_size; + newnref->nref_offset = ofs; + + vec.iov_base = (void *)data; + vec.iov_len = totlen; + mutex_enter(&chmp->chm_lock_sizes); + ret = chfs_write_wbuf(chmp, &vec, 1, ofs, &retlen); + + if (ret || retlen != totlen) { + chfs_err("error while writing out to the media\n"); + chfs_err("err: %d | size: %zu | retlen : %zu\n", + ret, totlen, retlen); + + chfs_change_size_dirty(chmp, chmp->chm_nextblock, totlen); + if (retries) { + mutex_exit(&chmp->chm_lock_sizes); + return EIO; + } + + retries++; + mutex_exit(&chmp->chm_lock_sizes); + goto retry; + } + + mutex_exit(&chmp->chm_lock_sizes); + //TODO should we set free_size? + chfs_mark_node_obsolete(chmp, nref); + chfs_add_vnode_ref_to_vc(chmp, chvc, newnref); + return 0; +} + + +int +chfs_gcollect_live(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb, struct chfs_node_ref *nref, + struct chfs_inode *ip) +{ + struct chfs_node_frag *frag; + struct chfs_full_dnode *fn = NULL; + int start = 0, end = 0, nrfrags = 0; + struct chfs_dirent *fd = NULL; + int ret = 0; + bool is_dirent; + + dbg_gc("gcollect_live\n"); + + if (chmp->chm_gcblock != cheb) { + dbg_gc("GC block is no longer gcblock. Restart.\n"); + goto upnout; + } + + if (CHFS_REF_OBSOLETE(nref)) { + dbg_gc("node to be GC'd was obsoleted in the meantime.\n"); + goto upnout; + } + + /* It's a vnode? */ + if (ip->chvc->v == nref) { + chfs_gcollect_vnode(chmp, ip); + goto upnout; + } + + /* find fn */ + dbg_gc("find full dnode\n"); + for(frag = frag_first(&ip->fragtree); + frag; frag = frag_next(&ip->fragtree, frag)) { + if (frag->node && frag->node->nref == nref) { + fn = frag->node; + end = frag->ofs + frag->size; + if (!nrfrags++) + start = frag->ofs; + if (nrfrags == frag->node->frags) + break; + } + } + + /* It's a pristine node, or dnode (or hole? XXX have we hole nodes?) 
*/ + if (fn) { + if (CHFS_REF_FLAGS(nref) == CHFS_PRISTINE_NODE_MASK) { + ret = chfs_gcollect_pristine(chmp, + cheb, ip->chvc, nref); + if (!ret) { + frag->node->nref = ip->chvc->v; + } + if (ret != EBADF) + goto upnout; + } + //ret = chfs_gcollect_hole(chmp, cheb, ip, fn, start, end); + ret = chfs_gcollect_dnode(chmp, cheb, ip, fn, start, end); + goto upnout; + } + + + /* It's a dirent? */ + dbg_gc("find full dirent\n"); + is_dirent = false; + TAILQ_FOREACH(fd, &ip->dents, fds) { + if (fd->nref == nref) { + is_dirent = true; + break; + } + } + + if (is_dirent && fd->vno) { + ret = chfs_gcollect_dirent(chmp, cheb, ip, fd); + } else if (is_dirent) { + ret = chfs_gcollect_deletion_dirent(chmp, cheb, ip, fd); + } else { + dbg_gc("Nref at leb #%u offset 0x%08x wasn't in node list" + " for ino #%llu\n", + nref->nref_lnr, CHFS_GET_OFS(nref->nref_offset), + (unsigned long long)ip->ino); + if (CHFS_REF_OBSOLETE(nref)) { + dbg_gc("But it's obsolete so we don't mind" + " too much.\n"); + } + } + +upnout: + return ret; +} + +int +chfs_gcollect_vnode(struct chfs_mount *chmp, struct chfs_inode *ip) +{ + int ret; + dbg_gc("gcollect_vnode\n"); + + ret = chfs_write_flash_vnode(chmp, ip, ALLOC_GC); + + return ret; +} + +int +chfs_gcollect_dirent(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb, struct chfs_inode *parent, + struct chfs_dirent *fd) +{ + struct vnode *vnode = NULL; + struct chfs_inode *ip; + struct chfs_node_ref *prev; + dbg_gc("gcollect_dirent\n"); + + vnode = chfs_vnode_lookup(chmp, fd->vno); + + /* XXX maybe KASSERT or panic on this? */ + if (vnode == NULL) { + return ENOENT; + } + + ip = VTOI(vnode); + + prev = parent->chvc->dirents; + if (prev == fd->nref) { + parent->chvc->dirents = prev->nref_next; + dbg_gc("fd nref removed from dirents list\n"); + prev = NULL; + } + while (prev) { + if (prev->nref_next == fd->nref) { + prev->nref_next = fd->nref->nref_next; + dbg_gc("fd nref removed from dirents list\n"); + break; + } + prev = prev->nref_next; + } + + prev = fd->nref; + chfs_mark_node_obsolete(chmp, fd->nref); + return chfs_write_flash_dirent(chmp, + parent, ip, fd, fd->vno, ALLOC_GC); +} + +/* Check dirents what are marked as deleted. 
*/ +int +chfs_gcollect_deletion_dirent(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb, struct chfs_inode *parent, + struct chfs_dirent *fd) +{ + struct chfs_flash_dirent_node chfdn; + struct chfs_node_ref *nref; + size_t retlen, name_len, nref_len; + uint32_t name_crc; + + int ret; + + struct vnode *vnode = NULL; + + dbg_gc("gcollect_deletion_dirent\n"); + + name_len = strlen(fd->name); + name_crc = crc32(0, fd->name, name_len); + + nref_len = chfs_nref_len(chmp, cheb, fd->nref); + + vnode = chfs_vnode_lookup(chmp, fd->vno); + + //dbg_gc("ip from vnode\n"); + //VFS_VGET(chmp->chm_fsmp, fd->vno, &vnode); + //ip = VTOI(vnode); + //vput(vnode); + + //dbg_gc("mutex enter erase_completion_lock\n"); + +// dbg_gc("alloc chfdn\n"); +// chfdn = kmem_alloc(nref_len, KM_SLEEP); +// if (!chfdn) +// return ENOMEM; + + for (nref = parent->chvc->dirents; + nref != (void*)parent->chvc; + nref = nref->nref_next) { + + if (!CHFS_REF_OBSOLETE(nref)) + continue; + + /* if node refs have different length, skip */ + if (chfs_nref_len(chmp, NULL, nref) != nref_len) + continue; + + if (CHFS_GET_OFS(nref->nref_offset) == + CHFS_GET_OFS(fd->nref->nref_offset)) { + continue; + } + + ret = chfs_read_leb(chmp, + nref->nref_lnr, (void*)&chfdn, CHFS_GET_OFS(nref->nref_offset), + nref_len, &retlen); + + if (ret) { + dbg_gc("Read error: %d\n", ret); + continue; + } + + if (retlen != nref_len) { + dbg_gc("Error reading node:" + " read: %zu insted of: %zu\n", retlen, nref_len); + continue; + } + + /* if node type doesn't match, skip */ + if (le16toh(chfdn.type) != CHFS_NODETYPE_DIRENT) + continue; + + /* if crc doesn't match, skip */ + if (le32toh(chfdn.name_crc) != name_crc) + continue; + + /* if length of name different, or this is an another deletion + * dirent, skip + */ + if (chfdn.nsize != name_len || !le64toh(chfdn.vno)) + continue; + + /* check actual name */ + if (memcmp(chfdn.name, fd->name, name_len)) + continue; + +// kmem_free(chfdn, nref_len); + + chfs_mark_node_obsolete(chmp, fd->nref); + return chfs_write_flash_dirent(chmp, + parent, NULL, fd, fd->vno, ALLOC_GC); + } + +// kmem_free(chfdn, nref_len); + + TAILQ_REMOVE(&parent->dents, fd, fds); + chfs_free_dirent(fd); + return 0; +} + +int +chfs_gcollect_dnode(struct chfs_mount *chmp, + struct chfs_eraseblock *orig_cheb, struct chfs_inode *ip, + struct chfs_full_dnode *fn, uint32_t orig_start, uint32_t orig_end) +{ + struct chfs_node_ref *nref, *prev; + struct chfs_full_dnode *newfn; + struct chfs_flash_data_node *fdnode; + int ret = 0, retries = 0; + uint32_t totlen; + char *data = NULL; + struct iovec vec; + size_t retlen; + dbg_gc("gcollect_dnode\n"); + + //uint32_t used_size; + +/* TODO GC merging frags, should we use it? + + uint32_t start, end; + + start = orig_start; + end = orig_end; + + if (chmp->chm_nr_free_blocks + chmp->chm_nr_erasable_blocks > chmp->chm_resv_blocks_gcmerge) { + struct chfs_node_frag *frag; + uint32_t min, max; + + min = start & (PAGE_CACHE_SIZE-1); + max = min + PAGE_CACHE_SIZE; + + frag = (struct chfs_node_frag *)rb_tree_find_node_leq(&ip->i_chfs_ext.fragtree, &start); + KASSERT(frag->ofs == start); + + while ((frag = frag_prev(&ip->i_chfs_ext.fragtree, frag)) && frag->ofs >= min) { + if (frag->ofs > min) { + start = frag->ofs; + continue; + } + + if (!frag->node || !frag->node->nref) { + break; + } else { + struct chfs_node_ref *nref = frag->node->nref; + struct chfs_eraseblock *cheb; + + cheb = &chmp->chm_blocks[nref->nref_lnr]; + + if (cheb == chmp->chm_gcblock) + start = frag->ofs; + + //TODO is this a clean block? 
+ + start = frag->ofs; + break; + } + } + + end--; + frag = (struct chfs_node_frag *)rb_tree_find_node_leq(&ip->i_chfs_ext.fragtree, &(end)); + + while ((frag = frag_next(&ip->i_chfs_ext.fragtree, frag)) && (frag->ofs + frag->size <= max)) { + if (frag->ofs + frag->size < max) { + end = frag->ofs + frag->size; + continue; + } + + if (!frag->node || !frag->node->nref) { + break; + } else { + struct chfs_node_ref *nref = frag->node->nref; + struct chfs_eraseblock *cheb; + + cheb = &chmp->chm_blocks[nref->nref_lnr]; + + if (cheb == chmp->chm_gcblock) + end = frag->ofs + frag->size; + + //TODO is this a clean block? + + end = frag->ofs + frag->size; + break; + } + } + + KASSERT(end <= + frag_last(&ip->i_chfs_ext.fragtree)->ofs + + frag_last(&ip->i_chfs_ext.fragtree)->size); + KASSERT(end >= orig_end); + KASSERT(start <= orig_start); + } +*/ + KASSERT(orig_cheb->lnr == fn->nref->nref_lnr); + totlen = chfs_nref_len(chmp, orig_cheb, fn->nref); + data = kmem_alloc(totlen, KM_SLEEP); + + ret = chfs_read_leb(chmp, fn->nref->nref_lnr, data, fn->nref->nref_offset, + totlen, &retlen); + + fdnode = (struct chfs_flash_data_node *)data; + fdnode->version = htole64(++ip->chvc->highest_version); + fdnode->node_crc = htole32(crc32(0, (uint8_t *)fdnode, + sizeof(*fdnode) - 4)); + + vec.iov_base = (void *)data; + vec.iov_len = totlen; + +retry: + ret = chfs_reserve_space_gc(chmp, totlen); + if (ret) + goto out; + + nref = chfs_alloc_node_ref(chmp->chm_nextblock); + if (!nref) { + ret = ENOMEM; + goto out; + } + + mutex_enter(&chmp->chm_lock_sizes); + + nref->nref_offset = chmp->chm_ebh->eb_size - chmp->chm_nextblock->free_size; + KASSERT(nref->nref_offset % 4 == 0); + chfs_change_size_free(chmp, chmp->chm_nextblock, -totlen); + + ret = chfs_write_wbuf(chmp, &vec, 1, nref->nref_offset, &retlen); + if (ret || retlen != totlen) { + chfs_err("error while writing out to the media\n"); + chfs_err("err: %d | size: %d | retlen : %zu\n", + ret, totlen, retlen); + chfs_change_size_dirty(chmp, chmp->chm_nextblock, totlen); + if (retries) { + ret = EIO; + mutex_exit(&chmp->chm_lock_sizes); + goto out; + } + + retries++; + mutex_exit(&chmp->chm_lock_sizes); + goto retry; + } + + dbg_gc("new nref lnr: %u - offset: %u\n", nref->nref_lnr, nref->nref_offset); + + chfs_change_size_used(chmp, &chmp->chm_blocks[nref->nref_lnr], totlen); + mutex_exit(&chmp->chm_lock_sizes); + KASSERT(chmp->chm_blocks[nref->nref_lnr].used_size <= chmp->chm_ebh->eb_size); + + newfn = chfs_alloc_full_dnode(); + newfn->nref = nref; + newfn->ofs = fn->ofs; + newfn->size = fn->size; + newfn->frags = fn->frags; + + //TODO should we remove fd from dnode list? + + prev = ip->chvc->dnode; + if (prev == fn->nref) { + ip->chvc->dnode = prev->nref_next; + prev = NULL; + } + while (prev) { + if (prev->nref_next == fn->nref) { + prev->nref_next = fn->nref->nref_next; + break; + } + prev = prev->nref_next; + } + + chfs_add_full_dnode_to_inode(chmp, ip, newfn); + chfs_add_node_to_list(chmp, + ip->chvc, newfn->nref, &ip->chvc->dnode); + +out: + kmem_free(data, totlen); + return ret; +} diff --git a/sys/ufs/chfs/chfs_ihash.c b/sys/ufs/chfs/chfs_ihash.c new file mode 100644 index 000000000..b16b00c6a --- /dev/null +++ b/sys/ufs/chfs/chfs_ihash.c @@ -0,0 +1,220 @@ +/* $NetBSD: chfs_ihash.c,v 1.1 2011/11/24 15:51:31 ahoka Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * All rights reserved. 
+ * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "chfs.h" +/* + * Structures associated with inode cacheing. + */ +static LIST_HEAD(ihashhead, chfs_inode) *chfs_ihashtbl; +static u_long chfs_ihash; /* size of hash table - 1 */ +#define INOHASH(device, inum) (((device) + (inum)) & chfs_ihash) + +kmutex_t chfs_ihash_lock; +kmutex_t chfs_hashlock; + +/* + * Initialize inode hash table. + */ +void +chfs_ihashinit(void) +{ + dbg("initing\n"); + + mutex_init(&chfs_hashlock, MUTEX_DEFAULT, IPL_NONE); + mutex_init(&chfs_ihash_lock, MUTEX_DEFAULT, IPL_NONE); + chfs_ihashtbl = hashinit(desiredvnodes, + HASH_LIST, true, &chfs_ihash); +} + +/* + * Reinitialize inode hash table. + */ + +void +chfs_ihashreinit(void) +{ + struct chfs_inode *ip; + struct ihashhead *oldhash, *hash; + u_long oldmask, mask, val; + int i; + + dbg("reiniting\n"); + + hash = hashinit(desiredvnodes, HASH_LIST, true, &mask); + mutex_enter(&chfs_ihash_lock); + oldhash = chfs_ihashtbl; + oldmask = chfs_ihash; + chfs_ihashtbl = hash; + chfs_ihash = mask; + for (i = 0; i <= oldmask; i++) { + while ((ip = LIST_FIRST(&oldhash[i])) != NULL) { + LIST_REMOVE(ip, hash_entry); + val = INOHASH(ip->dev, ip->ino); + LIST_INSERT_HEAD(&hash[val], ip, hash_entry); + } + } + mutex_exit(&chfs_ihash_lock); + hashdone(oldhash, HASH_LIST, oldmask); +} + +/* + * Free inode hash table. + */ +void +chfs_ihashdone(void) +{ + dbg("destroying\n"); + + hashdone(chfs_ihashtbl, HASH_LIST, chfs_ihash); + mutex_destroy(&chfs_hashlock); + mutex_destroy(&chfs_ihash_lock); +} + +/* + * Use the device/inum pair to find the incore inode, and return a pointer + * to it. If it is in core, return it, even if it is locked. 
+ */ +struct vnode * +chfs_ihashlookup(dev_t dev, ino_t inum) +{ + struct chfs_inode *ip; + struct ihashhead *ipp; + + dbg("dev: %ju, inum: %ju\n", (uintmax_t )dev, (uintmax_t )inum); + + KASSERT(mutex_owned(&chfs_ihash_lock)); + + ipp = &chfs_ihashtbl[INOHASH(dev, inum)]; + LIST_FOREACH(ip, ipp, hash_entry) { + if (inum == ip->ino && dev == ip->dev) { + break; + } + } + + if (ip) { + return (ITOV(ip)); + } + + return (NULLVP); +} + +/* + * Use the device/inum pair to find the incore inode, and return a pointer + * to it. If it is in core, but locked, wait for it. + */ +struct vnode * +chfs_ihashget(dev_t dev, ino_t inum, int flags) +{ + struct ihashhead *ipp; + struct chfs_inode *ip; + struct vnode *vp; + + dbg("search for ino\n"); + +loop: + mutex_enter(&chfs_ihash_lock); + ipp = &chfs_ihashtbl[INOHASH(dev, inum)]; + dbg("ipp: %p, chfs_ihashtbl: %p, ihash: %lu\n", + ipp, chfs_ihashtbl, chfs_ihash); + LIST_FOREACH(ip, ipp, hash_entry) { + dbg("ip: %p\n", ip); + if (inum == ip->ino && dev == ip->dev) { +// printf("chfs_ihashget: found inode: %p\n", ip); + vp = ITOV(ip); + KASSERT(vp != NULL); + //dbg("found\n"); + if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) { + //dbg("wait for #%llu\n", ip->ino); + mutex_exit(&chfs_ihash_lock); + goto loop; + } + /* + if (VOP_ISLOCKED(vp)) + dbg("locked\n"); + else + dbg("isn't locked\n"); + */ + if (flags == 0) { + //dbg("no flags\n"); + mutex_exit(&chfs_ihash_lock); + } else { + //dbg("vget\n"); + mutex_enter(vp->v_interlock); + mutex_exit(&chfs_ihash_lock); + if (vget(vp, flags)) { + goto loop; + } + //dbg("got it\n"); + } + //dbg("return\n"); + return (vp); + } + } + //dbg("not found\n"); + mutex_exit(&chfs_ihash_lock); + return (NULL); +} + +/* + * Insert the inode into the hash table, and return it locked. + */ +void +chfs_ihashins(struct chfs_inode *ip) +{ + struct ihashhead *ipp; + + dbg("ip: %p\n", ip); + + KASSERT(mutex_owned(&chfs_hashlock)); + + /* lock the inode, then put it on the appropriate hash list */ + VOP_LOCK(ITOV(ip), LK_EXCLUSIVE); + + mutex_enter(&chfs_ihash_lock); + ipp = &chfs_ihashtbl[INOHASH(ip->dev, ip->ino)]; + LIST_INSERT_HEAD(ipp, ip, hash_entry); + mutex_exit(&chfs_ihash_lock); +} + +/* + * Remove the inode from the hash table. + */ +void +chfs_ihashrem(struct chfs_inode *ip) +{ + dbg("ip: %p\n", ip); + + mutex_enter(&chfs_ihash_lock); + LIST_REMOVE(ip, hash_entry); + mutex_exit(&chfs_ihash_lock); +} + diff --git a/include/ufs/chfs/chfs_inode.h b/sys/ufs/chfs/chfs_inode.h similarity index 100% rename from include/ufs/chfs/chfs_inode.h rename to sys/ufs/chfs/chfs_inode.h diff --git a/sys/ufs/chfs/chfs_malloc.c b/sys/ufs/chfs/chfs_malloc.c new file mode 100644 index 000000000..3138acc00 --- /dev/null +++ b/sys/ufs/chfs/chfs_malloc.c @@ -0,0 +1,396 @@ +/* $NetBSD: chfs_malloc.c,v 1.1 2011/11/24 15:51:31 ahoka Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * Copyright (C) 2010 Tamas Toth + * Copyright (C) 2010 Adam Hoka + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "chfs.h" +#include + +pool_cache_t chfs_vnode_cache; +pool_cache_t chfs_nrefs_cache; +pool_cache_t chfs_flash_vnode_cache; +pool_cache_t chfs_flash_dirent_cache; +pool_cache_t chfs_flash_dnode_cache; +pool_cache_t chfs_node_frag_cache; +pool_cache_t chfs_tmp_dnode_cache; +pool_cache_t chfs_tmp_dnode_info_cache; + +int +chfs_alloc_pool_caches() +{ + chfs_vnode_cache = pool_cache_init( + sizeof(struct chfs_vnode_cache), + 0, 0, 0, "chfs_vnode_cache", NULL, IPL_NONE, NULL, NULL, + NULL); + if (!chfs_vnode_cache) + goto err_vnode; + + chfs_nrefs_cache = pool_cache_init( + (REFS_BLOCK_LEN + 1) * sizeof(struct chfs_node_ref), 0, 0, + 0, "chfs_nrefs_pool", NULL, IPL_NONE, NULL, NULL, NULL); + if (!chfs_nrefs_cache) + goto err_nrefs; + + chfs_flash_vnode_cache = pool_cache_init( + sizeof(struct chfs_flash_vnode), 0, 0, 0, + "chfs_flash_vnode_pool", NULL, IPL_NONE, NULL, NULL, NULL); + if (!chfs_flash_vnode_cache) + goto err_flash_vnode; + + chfs_flash_dirent_cache = pool_cache_init( + sizeof(struct chfs_flash_dirent_node), 0, 0, 0, + "chfs_flash_dirent_pool", NULL, IPL_NONE, NULL, NULL, NULL); + if (!chfs_flash_dirent_cache) + goto err_flash_dirent; + + chfs_flash_dnode_cache = pool_cache_init( + sizeof(struct chfs_flash_data_node), 0, 0, 0, + "chfs_flash_dnode_pool", NULL, IPL_NONE, NULL, NULL, NULL); + if (!chfs_flash_dnode_cache) + goto err_flash_dnode; + + chfs_node_frag_cache = pool_cache_init( + sizeof(struct chfs_node_frag), 0, 0, 0, + "chfs_node_frag_pool", NULL, IPL_NONE, NULL, NULL, NULL); + if (!chfs_node_frag_cache) + goto err_node_frag; + + chfs_tmp_dnode_cache = pool_cache_init( + sizeof(struct chfs_tmp_dnode), 0, 0, 0, + "chfs_tmp_dnode_pool", NULL, IPL_NONE, NULL, NULL, NULL); + if (!chfs_tmp_dnode_cache) + goto err_tmp_dnode; + + chfs_tmp_dnode_info_cache = pool_cache_init( + sizeof(struct chfs_tmp_dnode_info), 0, 0, 0, + "chfs_tmp_dnode_info_pool", NULL, IPL_NONE, NULL, NULL, NULL); + if (!chfs_tmp_dnode_info_cache) + goto err_tmp_dnode_info; + + return 0; + +err_tmp_dnode_info: + pool_cache_destroy(chfs_tmp_dnode_cache); +err_tmp_dnode: + pool_cache_destroy(chfs_node_frag_cache); +err_node_frag: + pool_cache_destroy(chfs_flash_dnode_cache); +err_flash_dnode: + pool_cache_destroy(chfs_flash_dirent_cache); +err_flash_dirent: + pool_cache_destroy(chfs_flash_vnode_cache); +err_flash_vnode: + pool_cache_destroy(chfs_nrefs_cache); +err_nrefs: + pool_cache_destroy(chfs_vnode_cache); +err_vnode: + + return ENOMEM; +} + +void +chfs_destroy_pool_caches() +{ + if (chfs_vnode_cache) + pool_cache_destroy(chfs_vnode_cache); + + if 
(chfs_nrefs_cache) + pool_cache_destroy(chfs_nrefs_cache); + + if (chfs_flash_vnode_cache) + pool_cache_destroy(chfs_flash_vnode_cache); + + if (chfs_flash_dirent_cache) + pool_cache_destroy(chfs_flash_dirent_cache); + + if (chfs_flash_dnode_cache) + pool_cache_destroy(chfs_flash_dnode_cache); + + if (chfs_node_frag_cache) + pool_cache_destroy(chfs_node_frag_cache); + + if (chfs_tmp_dnode_cache) + pool_cache_destroy(chfs_tmp_dnode_cache); + + if (chfs_tmp_dnode_info_cache) + pool_cache_destroy(chfs_tmp_dnode_info_cache); +} + +struct chfs_vnode_cache * +chfs_vnode_cache_alloc(ino_t vno) +{ + struct chfs_vnode_cache* vc; + vc = pool_cache_get(chfs_vnode_cache, PR_WAITOK); + + memset(vc, 0, sizeof(*vc)); + vc->vno = vno; + vc->v = (void *)vc; + vc->dirents = (void *)vc; + vc->dnode = (void *)vc; + TAILQ_INIT(&vc->scan_dirents); + vc->highest_version = 0; + + return vc; +} + +void +chfs_vnode_cache_free(struct chfs_vnode_cache *vc) +{ + //kmem_free(vc->vno_version, sizeof(uint64_t)); + pool_cache_put(chfs_vnode_cache, vc); +} + +/** + * chfs_alloc_refblock - allocating a refblock + * + * Returns a pointer to the first element of the block. + * + * We do not allocate just one node ref; instead we allocate REFS_BLOCK_LEN + * node refs, and the last element is a pointer to the next block. + * We do this because we need a chain of nodes ordered by their + * physical address. + * + */ +struct chfs_node_ref* +chfs_alloc_refblock(void) +{ + int i; + struct chfs_node_ref *nref; + nref = pool_cache_get(chfs_nrefs_cache, PR_WAITOK); + + for (i = 0; i < REFS_BLOCK_LEN; i++) { + nref[i].nref_lnr = REF_EMPTY_NODE; + nref[i].nref_next = NULL; + } + i = REFS_BLOCK_LEN; + nref[i].nref_lnr = REF_LINK_TO_NEXT; + nref[i].nref_next = NULL; + + return nref; +} + +/** + * chfs_free_refblock - freeing a refblock + */ +void +chfs_free_refblock(struct chfs_node_ref *nref) +{ + pool_cache_put(chfs_nrefs_cache, nref); +} + +/** + * chfs_alloc_node_ref - allocating a node ref from a refblock + * @cheb: eraseblock information structure + * + * Allocates a node ref from a refblock; if there isn't any free element in the + * block, a new block is allocated and linked to the current one.
+ */ +struct chfs_node_ref* +chfs_alloc_node_ref(struct chfs_eraseblock *cheb) +{ + struct chfs_node_ref *nref, *new, *old; + old = cheb->last_node; + nref = cheb->last_node; + + if (!nref) { + //There haven't been any nref allocated for this block yet + nref = chfs_alloc_refblock(); + + cheb->first_node = nref; + cheb->last_node = nref; + nref->nref_lnr = cheb->lnr; + KASSERT(cheb->lnr == nref->nref_lnr); + + return nref; + } + + nref++; + if (nref->nref_lnr == REF_LINK_TO_NEXT) { + new = chfs_alloc_refblock(); + nref->nref_next = new; + nref = new; + } + + cheb->last_node = nref; + nref->nref_lnr = cheb->lnr; + + KASSERT(old->nref_lnr == nref->nref_lnr && + nref->nref_lnr == cheb->lnr); + + return nref; +} + +/** + * chfs_free_node_refs - freeing an eraseblock's node refs + * @cheb: eraseblock information structure + */ +void +chfs_free_node_refs(struct chfs_eraseblock *cheb) +{ + struct chfs_node_ref *nref, *block; + + block = nref = cheb->first_node; + + while (nref) { + if (nref->nref_lnr == REF_LINK_TO_NEXT) { + nref = nref->nref_next; + chfs_free_refblock(block); + block = nref; + continue; + } + nref++; + } +} + +struct chfs_dirent* +chfs_alloc_dirent(int namesize) +{ + struct chfs_dirent *ret; + size_t size = sizeof(struct chfs_dirent) + namesize; + + ret = kmem_alloc(size, KM_SLEEP); + //ret->alloc_size = size; + + return ret; +} + +void +chfs_free_dirent(struct chfs_dirent *dirent) +{ + //size_t size = dirent->alloc_size; + size_t size = sizeof(struct chfs_dirent) + dirent->nsize + 1; + + kmem_free(dirent, size); +} + +struct chfs_full_dnode* +chfs_alloc_full_dnode() +{ + struct chfs_full_dnode *ret; + ret = kmem_alloc(sizeof(struct chfs_full_dnode), KM_SLEEP); + return ret; +} + +void +chfs_free_full_dnode(struct chfs_full_dnode *fd) +{ + kmem_free(fd,(sizeof(struct chfs_full_dnode))); +} + +struct chfs_flash_vnode* +chfs_alloc_flash_vnode() +{ + struct chfs_flash_vnode *ret; + ret = pool_cache_get(chfs_flash_vnode_cache, 0); + return ret; +} + +void +chfs_free_flash_vnode(struct chfs_flash_vnode *fvnode) +{ + pool_cache_put(chfs_flash_vnode_cache, fvnode); +} + +struct chfs_flash_dirent_node* +chfs_alloc_flash_dirent() +{ + struct chfs_flash_dirent_node *ret; + ret = pool_cache_get(chfs_flash_dirent_cache, 0); + return ret; +} + +void +chfs_free_flash_dirent(struct chfs_flash_dirent_node *fdnode) +{ + pool_cache_put(chfs_flash_dirent_cache, fdnode); +} + +struct chfs_flash_data_node* +chfs_alloc_flash_dnode() +{ + struct chfs_flash_data_node *ret; + ret = pool_cache_get(chfs_flash_dnode_cache, 0); + return ret; +} + +void +chfs_free_flash_dnode(struct chfs_flash_data_node *fdnode) +{ + pool_cache_put(chfs_flash_dnode_cache, fdnode); +} + + +struct chfs_node_frag* +chfs_alloc_node_frag() +{ + struct chfs_node_frag *ret; + ret = pool_cache_get(chfs_node_frag_cache, 0); + return ret; + +} + +void +chfs_free_node_frag(struct chfs_node_frag *frag) +{ + pool_cache_put(chfs_node_frag_cache, frag); +} + +struct chfs_tmp_dnode * +chfs_alloc_tmp_dnode() +{ + struct chfs_tmp_dnode *ret; + ret = pool_cache_get(chfs_tmp_dnode_cache, 0); + ret->next = NULL; + return ret; +} + +void +chfs_free_tmp_dnode(struct chfs_tmp_dnode *td) +{ + pool_cache_put(chfs_tmp_dnode_cache, td); +} + +struct chfs_tmp_dnode_info * +chfs_alloc_tmp_dnode_info() +{ + struct chfs_tmp_dnode_info *ret; + ret = pool_cache_get(chfs_tmp_dnode_info_cache, 0); + ret->tmpnode = NULL; + return ret; +} + +void +chfs_free_tmp_dnode_info(struct chfs_tmp_dnode_info *di) +{ + pool_cache_put(chfs_tmp_dnode_info_cache, di); +} + 
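+/*
+ * Illustrative sketch (hypothetical helper, not part of the change above):
+ * how a refblock chain produced by chfs_alloc_refblock() is walked, assuming
+ * only the declarations already pulled in from "chfs.h".  Slots holding a
+ * real ref carry an eraseblock lnr, the trailing REF_LINK_TO_NEXT slot points
+ * at the next refblock, and REF_EMPTY_NODE marks the first unused slot.
+ * Kept under #if 0 so it is never compiled.
+ */
+#if 0
+static void
+chfs_refblock_walk_sketch(struct chfs_node_ref *block)
+{
+	struct chfs_node_ref *nref = block;
+
+	while (nref) {
+		if (nref->nref_lnr == REF_LINK_TO_NEXT) {
+			/* follow the link slot to the next refblock */
+			nref = nref->nref_next;
+			continue;
+		}
+		if (nref->nref_lnr == REF_EMPTY_NODE)
+			break;	/* past the last allocated ref */
+		/* ... process nref here ... */
+		nref++;
+	}
+}
+#endif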
diff --git a/sys/ufs/chfs/chfs_nodeops.c b/sys/ufs/chfs/chfs_nodeops.c new file mode 100644 index 000000000..bf761dd66 --- /dev/null +++ b/sys/ufs/chfs/chfs_nodeops.c @@ -0,0 +1,570 @@ +/* $NetBSD: chfs_nodeops.c,v 1.1 2011/11/24 15:51:31 ahoka Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * Copyright (C) 2010 David Tengeri + * Copyright (C) 2010 Tamas Toth + * Copyright (C) 2010 Adam Hoka + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "chfs.h" + +/** + * chfs_update_eb_dirty - updates dirty and free space, first and + * last node references + * @sbi: CHFS main descriptor structure + * @cheb: eraseblock to update + * @size: increase dirty space size with this + * Returns zero in case of success, %1 in case of fail. + */ +int +chfs_update_eb_dirty(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb, uint32_t size) +{ + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + KASSERT(!mutex_owned(&chmp->chm_lock_sizes)); + + if (!size) + return 0; + + if (size > cheb->free_size) { + chfs_err("free_size (%d) is less then dirty space (%d) " + "on block (%d)\n", cheb->free_size, size, cheb->lnr); + return 1; + } + mutex_enter(&chmp->chm_lock_sizes); + //dbg("BEFORE: free_size: %d\n", cheb->free_size); + chfs_change_size_free(chmp, cheb, -size); + chfs_change_size_dirty(chmp, cheb, size); + //dbg(" AFTER: free_size: %d\n", cheb->free_size); + mutex_exit(&chmp->chm_lock_sizes); + return 0; +} + +/** + * chfs_add_node_to_list - adds a data node ref to vnode cache's dnode list + * @sbi: super block informations + * @new: node ref to insert + * @list: head of the list + * This function inserts a data node ref to the list of vnode cache. + * The list is sorted by data node's lnr and offset. 
+ */ +void +chfs_add_node_to_list(struct chfs_mount *chmp, + struct chfs_vnode_cache *vc, + struct chfs_node_ref *new, struct chfs_node_ref **list) +{ + struct chfs_node_ref *nextref = *list; + struct chfs_node_ref *prevref = NULL; + + while (nextref && nextref != (struct chfs_node_ref *)vc && + (nextref->nref_lnr <= new->nref_lnr)) { + if (nextref->nref_lnr == new->nref_lnr) { + while (nextref && nextref != + (struct chfs_node_ref *)vc && + (CHFS_GET_OFS(nextref->nref_offset) < + CHFS_GET_OFS(new->nref_offset))) { + prevref = nextref; + nextref = nextref->nref_next; + } + break; + } + prevref = nextref; + nextref = nextref->nref_next; + } + + if (nextref && nextref != (struct chfs_node_ref *)vc && + nextref->nref_lnr == new->nref_lnr && + CHFS_GET_OFS(nextref->nref_offset) == + CHFS_GET_OFS(new->nref_offset)) { + new->nref_next = nextref->nref_next; + } else { + new->nref_next = nextref; + } + + if (prevref) { + prevref->nref_next = new; + } else { + *list = new; + } +} + +void +chfs_add_fd_to_inode(struct chfs_mount *chmp, + struct chfs_inode *parent, struct chfs_dirent *new) +{ +// struct chfs_dirent **prev = &parent->dents; + struct chfs_dirent *fd, *tmpfd; + + if (new->version > parent->chvc->highest_version) { + parent->chvc->highest_version = new->version; + } + + //mutex_enter(&parent->inode_lock); + TAILQ_FOREACH_SAFE(fd, &parent->dents, fds, tmpfd) { + if (fd->nhash > new->nhash) { + /* insert new before fd */ + TAILQ_INSERT_BEFORE(fd, new, fds); + return; + } else if (fd->nhash == new->nhash && + !strcmp(fd->name, new->name)) { + if (new->version > fd->version) { +// new->next = fd->next; + /* replace fd with new */ + TAILQ_INSERT_BEFORE(fd, new, fds); + TAILQ_REMOVE(&parent->dents, fd, fds); + if (fd->nref) { + chfs_mark_node_obsolete(chmp, + fd->nref); + } + chfs_free_dirent(fd); +// *prev = new;//XXX + } else { + chfs_mark_node_obsolete(chmp, new->nref); + chfs_free_dirent(new); + } + return; + } + } + /* if we couldnt fit it elsewhere, lets add to the end */ + /* FIXME insert tail or insert head? 
*/ + TAILQ_INSERT_HEAD(&parent->dents, new, fds); + //mutex_exit(&parent->inode_lock); +#if 0 + while ((*prev) && (*prev)->nhash <= new->nhash) { + if ((*prev)->nhash == new->nhash && + !strcmp((*prev)->name, new->name)) { + if (new->version > (*prev)->version) { + new->next = (*prev)->next; + if ((*prev)->nref) { + chfs_mark_node_obsolete(chmp, + (*prev)->nref); + } + chfs_free_dirent(*prev); + *prev = new; + } else { + chfs_mark_node_obsolete(chmp, new->nref); + chfs_free_dirent(new); + } + return; + } + prev = &((*prev)->next); + } + + new->next = *prev; + *prev = new; +#endif +} + +void +chfs_add_vnode_ref_to_vc(struct chfs_mount *chmp, + struct chfs_vnode_cache *vc, struct chfs_node_ref *new) +{ + if ((struct chfs_vnode_cache*)(vc->v) != vc) { + chfs_mark_node_obsolete(chmp, vc->v); + new->nref_next = vc->v->nref_next; + } else { + new->nref_next = vc->v; + } + vc->v = new; +} + +struct chfs_node_ref * +chfs_nref_next(struct chfs_node_ref *nref) +{ +// dbg("check nref: %u - %u\n", nref->nref_lnr, nref->nref_offset); + nref++; +// dbg("next nref: %u - %u\n", nref->nref_lnr, nref->nref_offset); + if (nref->nref_lnr == REF_LINK_TO_NEXT) { + //End of chain + if (!nref->nref_next) + return NULL; + + nref = nref->nref_next; + } + //end of chain + if (nref->nref_lnr == REF_EMPTY_NODE) + return NULL; + + return nref; +} + +int +chfs_nref_len(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb, struct chfs_node_ref *nref) +{ + struct chfs_node_ref *next; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + if (!cheb) + cheb = &chmp->chm_blocks[nref->nref_lnr]; + + next = chfs_nref_next(nref); + + if (!next) { + //dbg("next null\n"); + return chmp->chm_ebh->eb_size - cheb->free_size - + CHFS_GET_OFS(nref->nref_offset); + } + //dbg("size: %d\n", CHFS_GET_OFS(next->nref_offset) - CHFS_GET_OFS(nref->nref_offset)); + return CHFS_GET_OFS(next->nref_offset) - + CHFS_GET_OFS(nref->nref_offset); +} + +/** + * chfs_mark_node_obsolete - marks a node obsolete + */ +void +chfs_mark_node_obsolete(struct chfs_mount *chmp, + struct chfs_node_ref *nref) +{ + int len; + struct chfs_eraseblock *cheb; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + KASSERT(!CHFS_REF_OBSOLETE(nref)); + + KASSERT(nref->nref_lnr <= chmp->chm_ebh->peb_nr); + cheb = &chmp->chm_blocks[nref->nref_lnr]; + +#ifdef DIAGNOSTIC + if (cheb->used_size + cheb->free_size + cheb->dirty_size + + cheb->unchecked_size + cheb->wasted_size != chmp->chm_ebh->eb_size) { + dbg("eraseblock leak detected!\nused: %u\nfree: %u\n" + "dirty: %u\nunchecked: %u\nwasted: %u\ntotal: %u\nshould be: %zu\n", + cheb->used_size, cheb->free_size, cheb->dirty_size, + cheb->unchecked_size, cheb->wasted_size, cheb->used_size + cheb->free_size + + cheb->dirty_size + cheb->unchecked_size + cheb->wasted_size, + chmp->chm_ebh->eb_size); + } +#endif + + len = chfs_nref_len(chmp, cheb, nref); + //dbg("len: %u\n", len); + //dbg("1. used: %u\n", cheb->used_size); + + mutex_enter(&chmp->chm_lock_sizes); + + if (CHFS_REF_FLAGS(nref) == CHFS_UNCHECKED_NODE_MASK) { + //dbg("UNCHECKED mark an unchecked node\n"); + chfs_change_size_unchecked(chmp, cheb, -len); + //dbg("unchecked: %u\n", chmp->chm_unchecked_size); + } else { + chfs_change_size_used(chmp, cheb, -len); + + //dbg("2. 
used: %u\n", cheb->used_size); + KASSERT(cheb->used_size <= chmp->chm_ebh->eb_size); + } + chfs_change_size_dirty(chmp, cheb, len); + +#ifdef DIAGNOSTIC + if (cheb->used_size + cheb->free_size + cheb->dirty_size + + cheb->unchecked_size + cheb->wasted_size != chmp->chm_ebh->eb_size) { + panic("eraseblock leak detected!\nused: %u\nfree: %u\n" + "dirty: %u\nunchecked: %u\nwasted: %u\ntotal: %u\nshould be: %zu\n", + cheb->used_size, cheb->free_size, cheb->dirty_size, + cheb->unchecked_size, cheb->wasted_size, cheb->used_size + cheb->free_size + + cheb->dirty_size + cheb->unchecked_size + cheb->wasted_size, + chmp->chm_ebh->eb_size); + } +#endif + nref->nref_offset = CHFS_GET_OFS(nref->nref_offset) | + CHFS_OBSOLETE_NODE_MASK; + + if (chmp->chm_flags & CHFS_MP_FLAG_SCANNING) { + /*Scan is in progress, do nothing now*/ + mutex_exit(&chmp->chm_lock_sizes); + return; + } + + if (cheb == chmp->chm_nextblock) { + dbg("Not moving nextblock to dirty/erase_pending list\n"); + } else if (!cheb->used_size && !cheb->unchecked_size) { + if (cheb == chmp->chm_gcblock) { + dbg("gcblock is completely dirtied\n"); + chmp->chm_gcblock = NULL; + } else { + //remove from a tailq, but we don't know which tailq contains this cheb + //so we remove it from the dirty list now + //TAILQ_REMOVE(&chmp->chm_dirty_queue, cheb, queue); + int removed = 0; + struct chfs_eraseblock *eb, *tmpeb; + //XXX ugly code + TAILQ_FOREACH_SAFE(eb, &chmp->chm_free_queue, queue, tmpeb) { + if (eb == cheb) { + TAILQ_REMOVE(&chmp->chm_free_queue, cheb, queue); + removed = 1; + break; + } + } + if (removed == 0) { + TAILQ_FOREACH_SAFE(eb, &chmp->chm_dirty_queue, queue, tmpeb) { + if (eb == cheb) { + TAILQ_REMOVE(&chmp->chm_dirty_queue, cheb, queue); + removed = 1; + break; + } + } + } + if (removed == 0) { + TAILQ_FOREACH_SAFE(eb, &chmp->chm_very_dirty_queue, queue, tmpeb) { + if (eb == cheb) { + TAILQ_REMOVE(&chmp->chm_very_dirty_queue, cheb, queue); + removed = 1; + break; + } + } + } + if (removed == 0) { + TAILQ_FOREACH_SAFE(eb, &chmp->chm_clean_queue, queue, tmpeb) { + if (eb == cheb) { + TAILQ_REMOVE(&chmp->chm_clean_queue, cheb, queue); + removed = 1; + break; + } + } + } + } + if (chmp->chm_wbuf_len) { + dbg("Adding block to erasable pending wbuf queue\n"); + TAILQ_INSERT_TAIL(&chmp->chm_erasable_pending_wbuf_queue, + cheb, queue); + } else { + TAILQ_INSERT_TAIL(&chmp->chm_erase_pending_queue, + cheb, queue); + chmp->chm_nr_erasable_blocks++; + } + chfs_remap_leb(chmp); + } else if (cheb == chmp->chm_gcblock) { + dbg("Not moving gcblock to dirty list\n"); + } else if (cheb->dirty_size > MAX_DIRTY_TO_CLEAN && + cheb->dirty_size - len <= MAX_DIRTY_TO_CLEAN) { + dbg("Freshly dirtied, remove it from clean queue and " + "add it to dirty\n"); + TAILQ_REMOVE(&chmp->chm_clean_queue, cheb, queue); + TAILQ_INSERT_TAIL(&chmp->chm_dirty_queue, cheb, queue); + } else if (VERY_DIRTY(chmp, cheb->dirty_size) && + !VERY_DIRTY(chmp, cheb->dirty_size - len)) { + dbg("Becomes now very dirty, remove it from dirty " + "queue and add it to very dirty\n"); + TAILQ_REMOVE(&chmp->chm_dirty_queue, cheb, queue); + TAILQ_INSERT_TAIL(&chmp->chm_very_dirty_queue, cheb, queue); + } else { + dbg("Leave cheb where it is\n"); + } + mutex_exit(&chmp->chm_lock_sizes); + return; +} + +/** + * chfs_close_eraseblock - close an eraseblock + * @chmp: chfs mount structure + * @cheb: eraseblock informations + * + * This function close the physical chain of the nodes on the eraseblock, + * convert its free size to dirty and add it to clean, dirty or very dirty list. 
+ */ +int +chfs_close_eraseblock(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb) +{ + uint32_t offset; + struct chfs_node_ref *nref; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + offset = chmp->chm_ebh->eb_size - cheb->free_size; + + // Close the chain + nref = chfs_alloc_node_ref(cheb); + if (!nref) + return ENOMEM; + + nref->nref_next = NULL; + nref->nref_offset = offset; + + // Mark space as dirty + chfs_update_eb_dirty(chmp, cheb, cheb->free_size); + + if (cheb->dirty_size < MAX_DIRTY_TO_CLEAN) { + TAILQ_INSERT_TAIL(&chmp->chm_clean_queue, cheb, queue); + } else if (VERY_DIRTY(chmp, cheb->dirty_size)) { + TAILQ_INSERT_TAIL(&chmp->chm_very_dirty_queue, cheb, queue); + } else { + TAILQ_INSERT_TAIL(&chmp->chm_dirty_queue, cheb, queue); + } + return 0; +} + +int +chfs_reserve_space_normal(struct chfs_mount *chmp, uint32_t size, int prio) +{ + int ret; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + mutex_enter(&chmp->chm_lock_sizes); + while (chmp->chm_nr_free_blocks + chmp->chm_nr_erasable_blocks < chmp->chm_resv_blocks_write) { + dbg("free: %d, erasable: %d, resv: %d\n", chmp->chm_nr_free_blocks, chmp->chm_nr_erasable_blocks, chmp->chm_resv_blocks_write); + uint32_t avail, dirty; + if (prio == ALLOC_DELETION && chmp->chm_nr_free_blocks + chmp->chm_nr_erasable_blocks >= chmp->chm_resv_blocks_deletion) + break; + + dirty = chmp->chm_dirty_size - chmp->chm_nr_erasable_blocks * chmp->chm_ebh->eb_size + chmp->chm_unchecked_size; + if (dirty < chmp->chm_nospc_dirty) { + dbg("dirty: %u < nospc_dirty: %u\n", dirty, chmp->chm_nospc_dirty); + ret = ENOSPC; + mutex_exit(&chmp->chm_lock_sizes); + goto out; + } + + avail = chmp->chm_free_size - (chmp->chm_resv_blocks_write * chmp->chm_ebh->eb_size); + if (size > avail) { + dbg("size: %u > avail: %u\n", size, avail); + ret = ENOSPC; + mutex_exit(&chmp->chm_lock_sizes); + goto out; + } + + mutex_exit(&chmp->chm_lock_sizes); + ret = chfs_gcollect_pass(chmp); + /* gcollect_pass exits chm_lock_mountfields */ + mutex_enter(&chmp->chm_lock_mountfields); + mutex_enter(&chmp->chm_lock_sizes); + + if (chmp->chm_nr_erasable_blocks || + !TAILQ_EMPTY(&chmp->chm_erasable_pending_wbuf_queue) || + ret == EAGAIN) { + ret = chfs_remap_leb(chmp); + } + + if (ret) { + mutex_exit(&chmp->chm_lock_sizes); + goto out; + } + } + + mutex_exit(&chmp->chm_lock_sizes); + ret = chfs_reserve_space(chmp, size); +out: + return ret; +} + + +int +chfs_reserve_space_gc(struct chfs_mount *chmp, uint32_t size) +{ + int ret; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + mutex_enter(&chmp->chm_lock_sizes); + chfs_remap_leb(chmp); + + if (size > chmp->chm_free_size) { + dbg("size: %u\n", size); + mutex_exit(&chmp->chm_lock_sizes); + return ENOSPC; + } + + mutex_exit(&chmp->chm_lock_sizes); + ret = chfs_reserve_space(chmp, size); + return ret; +} + +/** + * chfs_reserve_space - finds a block which free size is >= requested size + * @chmp: chfs mount point + * @size: requested size + * @len: reserved spaced will be returned in this variable; + * Returns zero in case of success, error code in case of fail. 
+ */ +int +chfs_reserve_space(struct chfs_mount *chmp, uint32_t size) +{ + //TODO define minimum reserved blocks, which is needed for writing + //TODO check we have enough free blocks to write + //TODO if no: need erase and GC + + int err; + struct chfs_eraseblock *cheb; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + KASSERT(!mutex_owned(&chmp->chm_lock_sizes)); + + cheb = chmp->chm_nextblock; + //if (cheb) + //dbg("cheb->free_size %u\n", cheb->free_size); + if (cheb && size > cheb->free_size) { + dbg("size: %u > free_size: %u\n", size, cheb->free_size); + /* + * There isn't enough space on this eraseblock, we mark this as + * dirty and close the physical chain of the node refs. + */ + //Write out pending data if any + if (chmp->chm_wbuf_len) { + chfs_flush_pending_wbuf(chmp); + //FIXME need goto restart here? + } + + while (chmp->chm_wbuf_ofs < chmp->chm_ebh->eb_size) { + dbg("wbuf ofs: %zu - eb_size: %zu\n", + chmp->chm_wbuf_ofs, chmp->chm_ebh->eb_size); + chfs_flush_pending_wbuf(chmp); + } + + if (!(chmp->chm_wbuf_ofs % chmp->chm_ebh->eb_size) && !chmp->chm_wbuf_len) + chmp->chm_wbuf_ofs = 0xffffffff; + + err = chfs_close_eraseblock(chmp, cheb); + if (err) + return err; + + cheb = NULL; + } + if (!cheb) { + //get a block for nextblock + if (TAILQ_EMPTY(&chmp->chm_free_queue)) { + // If this succeeds there will be a block on free_queue + dbg("cheb remap (free: %d)\n", chmp->chm_nr_free_blocks); + err = chfs_remap_leb(chmp); + if (err) + return err; + } + cheb = TAILQ_FIRST(&chmp->chm_free_queue); + TAILQ_REMOVE(&chmp->chm_free_queue, cheb, queue); + chmp->chm_nextblock = cheb; + chmp->chm_nr_free_blocks--; + } + + return 0; +} + diff --git a/sys/ufs/chfs/chfs_pool.c b/sys/ufs/chfs/chfs_pool.c new file mode 100644 index 000000000..6e25d17f2 --- /dev/null +++ b/sys/ufs/chfs/chfs_pool.c @@ -0,0 +1,211 @@ +/* $NetBSD: chfs_pool.c,v 1.1 2011/11/24 15:51:31 ahoka Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Pool allocator and convenience routines for chfs. 
+ */ + +#include + +#include +#include +#include + +#include + +#include "chfs.h" +//#include + +/* --------------------------------------------------------------------- */ + +void * chfs_pool_page_alloc(struct pool *, int); +void chfs_pool_page_free(struct pool *, void *); + +extern void* pool_page_alloc_nointr(struct pool *, int); +extern void pool_page_free_nointr(struct pool *, void *); + +/* --------------------------------------------------------------------- */ + +struct pool_allocator chfs_pool_allocator = { + .pa_alloc = chfs_pool_page_alloc, + .pa_free = chfs_pool_page_free, +}; + +/* --------------------------------------------------------------------- */ + +void +chfs_pool_init(struct chfs_pool *chpp, size_t size, const char *what, + struct chfs_mount *chmp) +{ + int cnt; + + cnt = snprintf(chpp->chp_name, sizeof(chpp->chp_name), + "%s_chfs_%p", what, chmp); + KASSERT(cnt < sizeof(chpp->chp_name)); + + pool_init(&chpp->chp_pool, size, 0, 0, 0, chpp->chp_name, + &chfs_pool_allocator, IPL_NONE); + chpp->chp_mount = chmp; +} + +/* --------------------------------------------------------------------- */ + +void +chfs_pool_destroy(struct chfs_pool *chpp) +{ + pool_destroy((struct pool *)chpp); +} + +/* --------------------------------------------------------------------- */ + +void * +chfs_pool_page_alloc(struct pool *pp, int flags) +{ + struct chfs_pool *chpp; + struct chfs_mount *chmp; + unsigned int pages; + void *page; + dbg("CHFS: pool_page_alloc()\n"); + + chpp = (struct chfs_pool *)pp; + chmp = chpp->chp_mount; + + pages = atomic_inc_uint_nv(&chmp->chm_pages_used); + if (pages >= CHFS_PAGES_MAX(chmp)) { + atomic_dec_uint(&chmp->chm_pages_used); + return NULL; + } + page = pool_page_alloc_nointr(pp, flags | PR_WAITOK); + if (page == NULL) { + atomic_dec_uint(&chmp->chm_pages_used); + } + + return page; +} + +/* --------------------------------------------------------------------- */ + +void +chfs_pool_page_free(struct pool *pp, void *v) +{ + struct chfs_pool *chpp; + struct chfs_mount *chmp; + dbg("CHFS: pool_page_free()\n"); + + chpp = (struct chfs_pool *)pp; + chmp = chpp->chp_mount; + + atomic_dec_uint(&chmp->chm_pages_used); + pool_page_free_nointr(pp, v); +} + +/* --------------------------------------------------------------------- */ + +void +chfs_str_pool_init(struct chfs_str_pool *chsp, struct chfs_mount *chmp) +{ + dbg("CHFS: str_pool_init()\n"); + + chfs_pool_init(&chsp->chsp_pool_16, 16, "str", chmp); + chfs_pool_init(&chsp->chsp_pool_32, 32, "str", chmp); + chfs_pool_init(&chsp->chsp_pool_64, 64, "str", chmp); + chfs_pool_init(&chsp->chsp_pool_128, 128, "str", chmp); + chfs_pool_init(&chsp->chsp_pool_256, 256, "str", chmp); + chfs_pool_init(&chsp->chsp_pool_512, 512, "str", chmp); + chfs_pool_init(&chsp->chsp_pool_1024, 1024, "str", chmp); +} + +/* --------------------------------------------------------------------- */ + +void +chfs_str_pool_destroy(struct chfs_str_pool *chsp) +{ + dbg("CHFS: str_pool_destroy()\n"); + + chfs_pool_destroy(&chsp->chsp_pool_16); + chfs_pool_destroy(&chsp->chsp_pool_32); + chfs_pool_destroy(&chsp->chsp_pool_64); + chfs_pool_destroy(&chsp->chsp_pool_128); + chfs_pool_destroy(&chsp->chsp_pool_256); + chfs_pool_destroy(&chsp->chsp_pool_512); + chfs_pool_destroy(&chsp->chsp_pool_1024); +} + +/* --------------------------------------------------------------------- */ + +char * +chfs_str_pool_get(struct chfs_str_pool *chsp, size_t len, int flags) +{ + struct chfs_pool *p; + dbg("CHFS: str_pool_get()\n"); + + KASSERT(len <= 1024); + + if 
(len <= 16) p = &chsp->chsp_pool_16; + else if (len <= 32) p = &chsp->chsp_pool_32; + else if (len <= 64) p = &chsp->chsp_pool_64; + else if (len <= 128) p = &chsp->chsp_pool_128; + else if (len <= 256) p = &chsp->chsp_pool_256; + else if (len <= 512) p = &chsp->chsp_pool_512; + else if (len <= 1024) p = &chsp->chsp_pool_1024; + else { + KASSERT(0); + p = NULL; /* Silence compiler warnings */ + } + + return (char *)CHFS_POOL_GET(p, flags); +} + +/* --------------------------------------------------------------------- */ + +void +chfs_str_pool_put(struct chfs_str_pool *chsp, char *str, size_t len) +{ + struct chfs_pool *p; + dbg("CHFS: str_pool_put()\n"); + + KASSERT(len <= 1024); + + if (len <= 16) p = &chsp->chsp_pool_16; + else if (len <= 32) p = &chsp->chsp_pool_32; + else if (len <= 64) p = &chsp->chsp_pool_64; + else if (len <= 128) p = &chsp->chsp_pool_128; + else if (len <= 256) p = &chsp->chsp_pool_256; + else if (len <= 512) p = &chsp->chsp_pool_512; + else if (len <= 1024) p = &chsp->chsp_pool_1024; + else { + KASSERT(0); + p = NULL; /* Silence compiler warnings */ + } + + CHFS_POOL_PUT(p, str); +} diff --git a/include/ufs/chfs/chfs_pool.h b/sys/ufs/chfs/chfs_pool.h similarity index 100% rename from include/ufs/chfs/chfs_pool.h rename to sys/ufs/chfs/chfs_pool.h diff --git a/sys/ufs/chfs/chfs_readinode.c b/sys/ufs/chfs/chfs_readinode.c new file mode 100644 index 000000000..3ae626f8d --- /dev/null +++ b/sys/ufs/chfs/chfs_readinode.c @@ -0,0 +1,1136 @@ +/* $NetBSD: chfs_readinode.c,v 1.2 2011/11/24 21:09:37 agc Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * Copyright (C) 2010 David Tengeri + * Copyright (C) 2010 Tamas Toth + * Copyright (C) 2010 Adam Hoka + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * chfs_readinode.c + * + * Created on: 2010.05.31. 
+ * Author: dtengeri + */ + +#include + +#include "chfs.h" + +/* tmp node operations */ +int chfs_check_td_data(struct chfs_mount *, + struct chfs_tmp_dnode *); +int chfs_check_td_node(struct chfs_mount *, + struct chfs_tmp_dnode *); +struct chfs_node_ref *chfs_first_valid_data_ref(struct chfs_node_ref *); +int chfs_add_tmp_dnode_to_tree(struct chfs_mount *, + struct chfs_readinode_info *, + struct chfs_tmp_dnode *); +void chfs_add_tmp_dnode_to_tdi(struct chfs_tmp_dnode_info *, + struct chfs_tmp_dnode *); +void chfs_remove_tmp_dnode_from_tdi(struct chfs_tmp_dnode_info *, + struct chfs_tmp_dnode *); +static void chfs_kill_td(struct chfs_mount *, + struct chfs_tmp_dnode *); +static void chfs_kill_tdi(struct chfs_mount *, + struct chfs_tmp_dnode_info *); +/* frag node operations */ +struct chfs_node_frag *new_fragment(struct chfs_full_dnode *, + uint32_t, + uint32_t); +int no_overlapping_node(struct rb_tree *, struct chfs_node_frag *, + struct chfs_node_frag *, uint32_t); +int chfs_add_frag_to_fragtree(struct chfs_mount *, + struct rb_tree *, + struct chfs_node_frag *); +void chfs_obsolete_node_frag(struct chfs_mount *, + struct chfs_node_frag *); +/* general node operations */ +int chfs_get_data_nodes(struct chfs_mount *, + struct chfs_inode *, + struct chfs_readinode_info *); +int chfs_build_fragtree(struct chfs_mount *, + struct chfs_inode *, + struct chfs_readinode_info *); + + + +/* + * -------------------------- + * tmp node rbtree operations + * -------------------------- + */ +static signed int +tmp_node_compare_nodes(void *ctx, const void *n1, const void *n2) +{ + const struct chfs_tmp_dnode_info *tdi1 = n1; + const struct chfs_tmp_dnode_info *tdi2 = n2; + + return (tdi1->tmpnode->node->ofs - tdi2->tmpnode->node->ofs); +} + +static signed int +tmp_node_compare_key(void *ctx, const void *n, const void *key) +{ + const struct chfs_tmp_dnode_info *tdi = n; + uint64_t ofs = *(const uint64_t *)key; + + return (tdi->tmpnode->node->ofs - ofs); +} + +const rb_tree_ops_t tmp_node_rbtree_ops = { + .rbto_compare_nodes = tmp_node_compare_nodes, + .rbto_compare_key = tmp_node_compare_key, + .rbto_node_offset = offsetof(struct chfs_tmp_dnode_info, rb_node), + .rbto_context = NULL +}; + + +/* + * --------------------------- + * frag node rbtree operations + * --------------------------- + */ +static signed int +frag_compare_nodes(void *ctx, const void *n1, const void *n2) +{ + const struct chfs_node_frag *frag1 = n1; + const struct chfs_node_frag *frag2 = n2; + + return (frag1->ofs - frag2->ofs); +} + +static signed int +frag_compare_key(void *ctx, const void *n, const void *key) +{ + const struct chfs_node_frag *frag = n; + uint64_t ofs = *(const uint64_t *)key; + + return (frag->ofs - ofs); +} + +const rb_tree_ops_t frag_rbtree_ops = { + .rbto_compare_nodes = frag_compare_nodes, + .rbto_compare_key = frag_compare_key, + .rbto_node_offset = offsetof(struct chfs_node_frag, rb_node), + .rbto_context = NULL +}; + + +/* + * ------------------- + * tmp node operations + * ------------------- + */ +/* + * Check the data CRC of the node. + * + * Returns: 0 - if everything OK; + * 1 - if CRC is incorrect; + * 2 - else; + * error code if an error occured. 
+ */ +int +chfs_check_td_data(struct chfs_mount *chmp, + struct chfs_tmp_dnode *td) +{ + int err; + size_t retlen, len, totlen; + uint32_t crc; + uint64_t ofs; + char *buf; + struct chfs_node_ref *nref = td->node->nref; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + KASSERT(!mutex_owned(&chmp->chm_lock_sizes)); + + ofs = CHFS_GET_OFS(nref->nref_offset) + sizeof(struct chfs_flash_data_node); + len = td->node->size; + if (!len) + return 0; + + buf = kmem_alloc(len, KM_SLEEP); + if (!buf) { + dbg("allocating error\n"); + return 2; + } + err = chfs_read_leb(chmp, nref->nref_lnr, buf, ofs, len, &retlen); + if (err) { + dbg("error wile reading: %d\n", err); + err = 2; + goto out; + } + + if (len != retlen) { + dbg("len:%zu, retlen:%zu\n", len, retlen); + err = 2; + goto out; + } + crc = crc32(0, (uint8_t *)buf, len); + + if (crc != td->data_crc) { + dbg("crc failed, calculated: 0x%x, orig: 0x%x\n", crc, td->data_crc); + kmem_free(buf, len); + return 1; + } + + nref->nref_offset = CHFS_GET_OFS(nref->nref_offset) | CHFS_NORMAL_NODE_MASK; + totlen = CHFS_PAD(sizeof(struct chfs_flash_data_node) + len); + + mutex_enter(&chmp->chm_lock_sizes); + chfs_change_size_unchecked(chmp, &chmp->chm_blocks[nref->nref_lnr], -totlen); + chfs_change_size_used(chmp, &chmp->chm_blocks[nref->nref_lnr], totlen); + mutex_exit(&chmp->chm_lock_sizes); + KASSERT(chmp->chm_blocks[nref->nref_lnr].used_size <= chmp->chm_ebh->eb_size); + + err = 0; +out: + kmem_free(buf, len); + return err; +} + +int +chfs_check_td_node(struct chfs_mount *chmp, struct chfs_tmp_dnode *td) +{ + int ret; + + if (CHFS_REF_FLAGS(td->node->nref) != CHFS_UNCHECKED_NODE_MASK) + return 0; + + ret = chfs_check_td_data(chmp, td); + if (ret == 1) { + chfs_mark_node_obsolete(chmp, td->node->nref); + } + return ret; +} + + +struct chfs_node_ref * +chfs_first_valid_data_ref(struct chfs_node_ref *nref) +{ + while (nref) { + if (!CHFS_REF_OBSOLETE(nref)) { +#ifdef DGB_MSG_GC + if (nref->nref_lnr == REF_EMPTY_NODE) { + dbg("FIRST VALID IS EMPTY!\n"); + } +#endif + return nref; + } + + if (nref->nref_next) { + nref = nref->nref_next; + } else + break; + } + return NULL; +} + +void +chfs_add_tmp_dnode_to_tdi(struct chfs_tmp_dnode_info *tdi, + struct chfs_tmp_dnode *td) +{ + if (!tdi->tmpnode) { + tdi->tmpnode = td; + } else { + struct chfs_tmp_dnode *tmp = tdi->tmpnode; + while (tmp->next) { + tmp = tmp->next; + } + tmp->next = td; + } +} + +void +chfs_remove_tmp_dnode_from_tdi(struct chfs_tmp_dnode_info *tdi, + struct chfs_tmp_dnode *td) +{ + if (tdi->tmpnode == td) { + tdi->tmpnode = tdi->tmpnode->next; + } else { + struct chfs_tmp_dnode *tmp = tdi->tmpnode->next; + while (tmp->next && tmp->next != td) { + tmp = tmp->next; + } + if (tmp->next) { + tmp->next = td->next; + } + } +} + +static void +chfs_kill_td(struct chfs_mount *chmp, + struct chfs_tmp_dnode *td) +{ + /* check if we need to mark as obsolete, to avoid double mark */ + if (!CHFS_REF_OBSOLETE(td->node->nref)) { + chfs_mark_node_obsolete(chmp, td->node->nref); + } + + chfs_free_tmp_dnode(td); +} + +static void +chfs_kill_tdi(struct chfs_mount *chmp, + struct chfs_tmp_dnode_info *tdi) +{ + struct chfs_tmp_dnode *next, *tmp = tdi->tmpnode; + + while (tmp) { + next = tmp->next; + chfs_kill_td(chmp, tmp); + tmp = next; + } + + chfs_free_tmp_dnode_info(tdi); +} + +int +chfs_add_tmp_dnode_to_tree(struct chfs_mount *chmp, + struct chfs_readinode_info *rii, + struct chfs_tmp_dnode *newtd) +{ + uint64_t end_ofs = newtd->node->ofs + newtd->node->size; + struct chfs_tmp_dnode_info *this; + struct 
rb_node *node, *prev_node; + struct chfs_tmp_dnode_info *newtdi; + + node = rb_tree_find_node(&rii->tdi_root, &newtd->node->ofs); + if (node) { + this = (struct chfs_tmp_dnode_info *)node; + while (this->tmpnode->overlapped) { + prev_node = rb_tree_iterate(&rii->tdi_root, node, RB_DIR_LEFT); + if (!prev_node) { + this->tmpnode->overlapped = 0; + break; + } + node = prev_node; + this = (struct chfs_tmp_dnode_info *)node; + } + } + while (node) { + this = (struct chfs_tmp_dnode_info *)node; + if (this->tmpnode->node->ofs > end_ofs) + break; + + struct chfs_tmp_dnode *tmp_td = this->tmpnode; + while (tmp_td) { + if (tmp_td->version == newtd->version) { + if (!chfs_check_td_node(chmp, tmp_td)) { + dbg("calling kill td 0\n"); + chfs_kill_td(chmp, newtd); + return 0; + } else { + chfs_remove_tmp_dnode_from_tdi(this, tmp_td); + chfs_kill_td(chmp, tmp_td); + chfs_add_tmp_dnode_to_tdi(this, newtd); + return 0; + } + } + if (tmp_td->version < newtd->version && + tmp_td->node->ofs >= newtd->node->ofs && + tmp_td->node->ofs + tmp_td->node->size <= end_ofs) { + /* New node entirely overlaps 'this' */ + if (chfs_check_td_node(chmp, newtd)) { + dbg("calling kill td 2\n"); + chfs_kill_td(chmp, newtd); + return 0; + } + /* ... and is good. Kill 'this' and any subsequent nodes which are also overlapped */ + while (tmp_td && tmp_td->node->ofs + tmp_td->node->size <= end_ofs) { + struct rb_node *next = rb_tree_iterate(&rii->tdi_root, this, RB_DIR_RIGHT); + struct chfs_tmp_dnode_info *next_tdi = (struct chfs_tmp_dnode_info *)next; + struct chfs_tmp_dnode *next_td = NULL; + if (tmp_td->next) { + next_td = tmp_td->next; + } else if (next_tdi) { + next_td = next_tdi->tmpnode; + } + if (tmp_td->version < newtd->version) { + chfs_remove_tmp_dnode_from_tdi(this, tmp_td); + chfs_kill_td(chmp, tmp_td); + if (!this->tmpnode) { + rb_tree_remove_node(&rii->tdi_root, this); + chfs_kill_tdi(chmp, this); + this = next_tdi; + } + } + tmp_td = next_td; + } + continue; + } + if (tmp_td->version > newtd->version && + tmp_td->node->ofs <= newtd->node->ofs && + tmp_td->node->ofs + tmp_td->node->size >= end_ofs) { + /* New node entirely overlapped by 'this' */ + if (!chfs_check_td_node(chmp, tmp_td)) { + dbg("this version: %llu\n", + (unsigned long long)tmp_td->version); + dbg("this ofs: %llu, size: %u\n", + (unsigned long long)tmp_td->node->ofs, + tmp_td->node->size); + dbg("calling kill td 4\n"); + chfs_kill_td(chmp, newtd); + return 0; + } + /* ... but 'this' was bad. Replace it... */ + chfs_remove_tmp_dnode_from_tdi(this, tmp_td); + chfs_kill_td(chmp, tmp_td); + if (!this->tmpnode) { + rb_tree_remove_node(&rii->tdi_root, this); + chfs_kill_tdi(chmp, this); + } + dbg("calling kill td 5\n"); + chfs_kill_td(chmp, newtd); + break; + } + tmp_td = tmp_td->next; + } + node = rb_tree_iterate(&rii->tdi_root, node, RB_DIR_RIGHT); + } + + newtdi = chfs_alloc_tmp_dnode_info(); + chfs_add_tmp_dnode_to_tdi(newtdi, newtd); + /* We neither completely obsoleted nor were completely + obsoleted by an earlier node. 
Insert into the tree */ + struct chfs_tmp_dnode_info *tmp_tdi = rb_tree_insert_node(&rii->tdi_root, newtdi); + if (tmp_tdi != newtdi) { + chfs_add_tmp_dnode_to_tdi(tmp_tdi, newtd); + newtdi->tmpnode = NULL; + chfs_kill_tdi(chmp, newtdi); + } + + /* If there's anything behind that overlaps us, note it */ + node = rb_tree_iterate(&rii->tdi_root, node, RB_DIR_LEFT); + if (node) { + while (1) { + this = (struct chfs_tmp_dnode_info *)node; + if (this->tmpnode->node->ofs + this->tmpnode->node->size > newtd->node->ofs) { + newtd->overlapped = 1; + } + if (!this->tmpnode->overlapped) + break; + + prev_node = rb_tree_iterate(&rii->tdi_root, node, RB_DIR_LEFT); + if (!prev_node) { + this->tmpnode->overlapped = 0; + break; + } + node = prev_node; + } + } + + /* If the new node overlaps anything ahead, note it */ + node = rb_tree_iterate(&rii->tdi_root, node, RB_DIR_RIGHT); + this = (struct chfs_tmp_dnode_info *)node; + while (this && this->tmpnode->node->ofs < end_ofs) { + this->tmpnode->overlapped = 1; + node = rb_tree_iterate(&rii->tdi_root, node, RB_DIR_RIGHT); + this = (struct chfs_tmp_dnode_info *)node; + } + return 0; +} + + +/* + * -------------------- + * frag node operations + * -------------------- + */ +struct chfs_node_frag * +new_fragment(struct chfs_full_dnode *fdn, uint32_t ofs, uint32_t size) +{ + struct chfs_node_frag *newfrag; + newfrag = chfs_alloc_node_frag(); + if (newfrag) { + newfrag->ofs = ofs; + newfrag->size = size; + newfrag->node = fdn; + } else { + chfs_err("cannot allocate a chfs_node_frag object\n"); + } + return newfrag; +} + +int +no_overlapping_node(struct rb_tree *fragtree, + struct chfs_node_frag *newfrag, + struct chfs_node_frag *this, uint32_t lastend) +{ + if (lastend < newfrag->node->ofs) { + struct chfs_node_frag *holefrag; + + holefrag = new_fragment(NULL, lastend, newfrag->node->ofs - lastend); + if (!holefrag) { + chfs_free_node_frag(newfrag); + return ENOMEM; + } + + rb_tree_insert_node(fragtree, holefrag); + this = holefrag; + } + + rb_tree_insert_node(fragtree, newfrag); + + return 0; +} + +int +chfs_add_frag_to_fragtree(struct chfs_mount *chmp, + struct rb_tree *fragtree, + struct chfs_node_frag *newfrag) +{ + struct chfs_node_frag *this; + uint32_t lastend; + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + this = (struct chfs_node_frag *)rb_tree_find_node_leq(fragtree, &newfrag->ofs); + + if (this) { + lastend = this->ofs + this->size; + } else { + lastend = 0; + } + + if (lastend <= newfrag->ofs) { + //dbg("no overlapping node\n"); + if (lastend && (lastend - 1) >> PAGE_SHIFT == newfrag->ofs >> PAGE_SHIFT) { + if (this->node) + CHFS_MARK_REF_NORMAL(this->node->nref); + CHFS_MARK_REF_NORMAL(newfrag->node->nref); + } + return no_overlapping_node(fragtree, newfrag, this, lastend); + } + + if (newfrag->ofs > this->ofs) { + + CHFS_MARK_REF_NORMAL(newfrag->node->nref); + if (this->node) + CHFS_MARK_REF_NORMAL(this->node->nref); + + if (this->ofs + this->size > newfrag->ofs + newfrag->size) { + /* newfrag is inside of this */ + //dbg("newfrag is inside of this\n"); + struct chfs_node_frag *newfrag2; + + newfrag2 = new_fragment(this->node, newfrag->ofs + newfrag->size, + this->ofs + this->size - newfrag->ofs - newfrag->size); + if (!newfrag2) + return ENOMEM; + if (this->node) + this->node->frags++; + + this->size = newfrag->ofs - this->ofs; + + rb_tree_insert_node(fragtree, newfrag); + rb_tree_insert_node(fragtree, newfrag2); + + return 0; + } + /* newfrag is bottom of this */ + //dbg("newfrag is bottom of this\n"); + this->size = newfrag->ofs - 
this->ofs; + rb_tree_insert_node(fragtree, newfrag); + } else { + /* newfrag start at same point */ + //dbg("newfrag start at same point\n"); + //TODO replace instead of remove and insert + rb_tree_remove_node(fragtree, this); + rb_tree_insert_node(fragtree, newfrag); + + if (newfrag->ofs + newfrag->size >= this->ofs+this->size) { + chfs_obsolete_node_frag(chmp, this); + } else { + this->ofs += newfrag->size; + this->size -= newfrag->size; + + rb_tree_insert_node(fragtree, this); + return 0; + } + } + /* OK, now we have newfrag added in the correct place in the tree, but + frag_next(newfrag) may be a fragment which is overlapped by it + */ + while ((this = frag_next(fragtree, newfrag)) && newfrag->ofs + newfrag->size >= this->ofs + this->size) { + rb_tree_remove_node(fragtree, this); + chfs_obsolete_node_frag(chmp, this); + } + + if (!this || newfrag->ofs + newfrag->size == this->ofs) + return 0; + + this->size = (this->ofs + this->size) - (newfrag->ofs + newfrag->size); + this->ofs = newfrag->ofs + newfrag->size; + + if (this->node) + CHFS_MARK_REF_NORMAL(this->node->nref); + CHFS_MARK_REF_NORMAL(newfrag->node->nref); + + return 0; +} + +void +chfs_kill_fragtree(struct rb_tree *fragtree) +{ + struct chfs_node_frag *this, *next; + //dbg("start\n"); + + this = (struct chfs_node_frag *)RB_TREE_MIN(fragtree); + while (this) { + //for (this = (struct chfs_node_frag *)RB_TREE_MIN(&fragtree); this != NULL; this = (struct chfs_node_frag *)rb_tree_iterate(&fragtree, &this->rb_node, RB_DIR_RIGHT)) { + next = frag_next(fragtree, this); + rb_tree_remove_node(fragtree, this); + chfs_free_node_frag(this); + //dbg("one frag killed\n"); + this = next; + } + //dbg("end\n"); +} + +uint32_t +chfs_truncate_fragtree(struct chfs_mount *chmp, + struct rb_tree *fragtree, uint32_t size) +{ + struct chfs_node_frag *frag; + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + dbg("truncate to size: %u\n", size); + + frag = (struct chfs_node_frag *)rb_tree_find_node_leq(fragtree, &size); + + /* Find the last frag before size and set its new size. */ + if (frag && frag->ofs != size) { + if (frag->ofs + frag->size > size) { + frag->size = size - frag->ofs; + } + frag = frag_next(fragtree, frag); + } + + /* Delete frags after new size. */ + while (frag && frag->ofs >= size) { + struct chfs_node_frag *next = frag_next(fragtree, frag); + + rb_tree_remove_node(fragtree, frag); + chfs_obsolete_node_frag(chmp, frag); + frag = next; + } + + if (size == 0) { + return 0; + } + + frag = frag_last(fragtree); + + if (!frag) { + return 0; + } + + if (frag->ofs + frag->size < size) { + return frag->ofs + frag->size; + } + + /* FIXME Should we check the postion of the last node? (PAGE_CACHE size, etc.) 
*/ + if (frag->node && (frag->ofs & (PAGE_SIZE - 1)) == 0) { + frag->node->nref->nref_offset = CHFS_GET_OFS(frag->node->nref->nref_offset) | CHFS_PRISTINE_NODE_MASK; + } + + return size; +} + +void +chfs_obsolete_node_frag(struct chfs_mount *chmp, + struct chfs_node_frag *this) +{ + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + if (this->node) { + this->node->frags--; + if (!this->node->frags) { + struct chfs_vnode_cache *vc = chfs_nref_to_vc(this->node->nref); + chfs_mark_node_obsolete(chmp, this->node->nref); + + if (vc->dnode == this->node->nref) { + vc->dnode = this->node->nref->nref_next; + } else { + struct chfs_node_ref *tmp = vc->dnode; + while (tmp->nref_next != (struct chfs_node_ref*) vc + && tmp->nref_next != this->node->nref) { + tmp = tmp->nref_next; + } + if (tmp->nref_next == this->node->nref) { + tmp->nref_next = this->node->nref->nref_next; + } + // FIXME should we free here the this->node->nref? + } + + chfs_free_full_dnode(this->node); + } else { + CHFS_MARK_REF_NORMAL(this->node->nref); + } + } + chfs_free_node_frag(this); +} + +int +chfs_add_full_dnode_to_inode(struct chfs_mount *chmp, + struct chfs_inode *ip, + struct chfs_full_dnode *fd) +{ + int ret; + struct chfs_node_frag *newfrag; + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + if (unlikely(!fd->size)) + return 0; + + newfrag = new_fragment(fd, fd->ofs, fd->size); + if (unlikely(!newfrag)) + return ENOMEM; + + newfrag->node->frags = 1; + + ret = chfs_add_frag_to_fragtree(chmp, &ip->fragtree, newfrag); + if (ret) + return ret; + + if (newfrag->ofs & (PAGE_SIZE - 1)) { + struct chfs_node_frag *prev = frag_prev(&ip->fragtree, newfrag); + + CHFS_MARK_REF_NORMAL(fd->nref); + if (prev->node) + CHFS_MARK_REF_NORMAL(prev->node->nref); + } + + if ((newfrag->ofs+newfrag->size) & (PAGE_SIZE - 1)) { + struct chfs_node_frag *next = frag_next(&ip->fragtree, newfrag); + + if (next) { + CHFS_MARK_REF_NORMAL(fd->nref); + if (next->node) + CHFS_MARK_REF_NORMAL(next->node->nref); + } + } + + return 0; +} + + +/* + * ----------------------- + * general node operations + * ----------------------- + */ +/* get tmp nodes of an inode */ +int +chfs_get_data_nodes(struct chfs_mount *chmp, + struct chfs_inode *ip, + struct chfs_readinode_info *rii) +{ + uint32_t crc; + int err; + size_t len, retlen; + struct chfs_node_ref *nref; + struct chfs_flash_data_node *dnode; + struct chfs_tmp_dnode *td; + char* buf; + + len = sizeof(struct chfs_flash_data_node); + buf = kmem_alloc(len, KM_SLEEP); + + dnode = kmem_alloc(len, KM_SLEEP); + if (!dnode) + return ENOMEM; + + nref = chfs_first_valid_data_ref(ip->chvc->dnode); + + rii->highest_version = ip->chvc->highest_version; + + while(nref && (struct chfs_vnode_cache *)nref != ip->chvc) { + err = chfs_read_leb(chmp, nref->nref_lnr, buf, CHFS_GET_OFS(nref->nref_offset), len, &retlen); + if (err || len != retlen) + goto out; + dnode = (struct chfs_flash_data_node*)buf; + + //check header crc + crc = crc32(0, (uint8_t *)dnode, CHFS_NODE_HDR_SIZE - 4); + if (crc != le32toh(dnode->hdr_crc)) { + chfs_err("CRC check failed. calc: 0x%x orig: 0x%x\n", crc, le32toh(dnode->hdr_crc)); + goto cont; + } + //check header magic bitmask + if (le16toh(dnode->magic) != CHFS_FS_MAGIC_BITMASK) { + chfs_err("Wrong magic bitmask.\n"); + goto cont; + } + //check node crc + crc = crc32(0, (uint8_t *)dnode, sizeof(*dnode) - 4); + if (crc != le32toh(dnode->node_crc)) { + chfs_err("Node CRC check failed. 
calc: 0x%x orig: 0x%x\n", crc, le32toh(dnode->node_crc)); + goto cont; + } + td = chfs_alloc_tmp_dnode(); + if (!td) { + chfs_err("Can't allocate tmp dnode info.\n"); + err = ENOMEM; + goto out; + } + /* We don't check data crc here, just add nodes to tmp frag tree, because + * we don't want to check nodes which have been overlapped by a new node + * with a higher version number. + */ + td->node = chfs_alloc_full_dnode(); + if (!td->node) { + chfs_err("Can't allocate full dnode info.\n"); + err = ENOMEM; + goto out_tmp_dnode; + } + td->version = le64toh(dnode->version); + td->node->ofs = le64toh(dnode->offset); + td->data_crc = le32toh(dnode->data_crc); + td->node->nref = nref; + td->node->size = le32toh(dnode->data_length); + td->overlapped = 0; + + if (td->version > rii->highest_version) { + rii->highest_version = td->version; + } + + err = chfs_add_tmp_dnode_to_tree(chmp, rii, td); + if (err) + goto out_full_dnode; + +cont: + nref = chfs_first_valid_data_ref(nref->nref_next); + } + + ip->chvc->highest_version = rii->highest_version; + return 0; + +/* Exit points */ +out_full_dnode: + chfs_free_full_dnode(td->node); +out_tmp_dnode: + chfs_free_tmp_dnode(td); +out: + kmem_free(buf, len); + kmem_free(dnode, len); + return err; +} + + +/* Build final normal fragtree from tdi tree. */ +int +chfs_build_fragtree(struct chfs_mount *chmp, struct chfs_inode *ip, + struct chfs_readinode_info *rii) +{ + struct chfs_tmp_dnode_info *pen, *last, *this; + struct rb_tree ver_tree; /* version tree */ + uint64_t high_ver = 0; + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + rb_tree_init(&ver_tree, &tmp_node_rbtree_ops); + + if (rii->mdata_tn) { + high_ver = rii->mdata_tn->tmpnode->version; + rii->latest_ref = rii->mdata_tn->tmpnode->node->nref; + } + + pen = (struct chfs_tmp_dnode_info *)RB_TREE_MAX(&rii->tdi_root); + + while((last = pen)) { + pen = (struct chfs_tmp_dnode_info *)rb_tree_iterate(&rii->tdi_root, last, RB_DIR_LEFT); + + rb_tree_remove_node(&rii->tdi_root, last); + rb_tree_insert_node(&ver_tree, last); + + if (last->tmpnode->overlapped) { + if (pen) + continue; + + last->tmpnode->overlapped = 0; + } + + this = (struct chfs_tmp_dnode_info *)RB_TREE_MAX(&ver_tree); + + while (this) { + struct chfs_tmp_dnode_info *vers_next; + int ret; + + vers_next = (struct chfs_tmp_dnode_info *)rb_tree_iterate(&ver_tree, this, RB_DIR_LEFT); + rb_tree_remove_node(&ver_tree, this); + + struct chfs_tmp_dnode *tmp_td = this->tmpnode; + while (tmp_td) { + struct chfs_tmp_dnode *next_td = tmp_td->next; + + if (chfs_check_td_node(chmp, tmp_td)) { + if (next_td) { + chfs_remove_tmp_dnode_from_tdi(this, tmp_td); + } else { + break; + } + } else { + if (tmp_td->version > high_ver) { + high_ver = tmp_td->version; + dbg("highver: %llu\n", (unsigned long long)high_ver); + rii->latest_ref = tmp_td->node->nref; + } + + ret = chfs_add_full_dnode_to_inode(chmp, ip, tmp_td->node); + if (ret) { + while (1) { + vers_next = (struct chfs_tmp_dnode_info *)rb_tree_iterate(&ver_tree, this, RB_DIR_LEFT); + while (tmp_td) { + next_td = tmp_td->next; + if (chfs_check_td_node(chmp, tmp_td) > 1) { + chfs_mark_node_obsolete(chmp, + tmp_td->node->nref); + } + chfs_free_full_dnode(tmp_td->node); + chfs_remove_tmp_dnode_from_tdi(this, tmp_td); + chfs_free_tmp_dnode(tmp_td); + tmp_td = next_td; + } + chfs_free_tmp_dnode_info(this); + this = vers_next; + if (!this) + break; + rb_tree_remove_node(&ver_tree, vers_next); + } + return ret; + } + + chfs_remove_tmp_dnode_from_tdi(this, tmp_td); + chfs_free_tmp_dnode(tmp_td); + } + tmp_td = 
next_td; + } + chfs_kill_tdi(chmp, this); + this = vers_next; + } + } + + return 0; +} + +int chfs_read_inode(struct chfs_mount *chmp, struct chfs_inode *ip) +{ + struct chfs_vnode_cache *vc = ip->chvc; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + +retry: + /* XXX locking */ + //mutex_enter(&chmp->chm_lock_vnocache); + switch (vc->state) { + case VNO_STATE_UNCHECKED: + case VNO_STATE_CHECKEDABSENT: +// chfs_vnode_cache_set_state(chmp, vc, VNO_STATE_READING); + vc->state = VNO_STATE_READING; + break; + case VNO_STATE_CHECKING: + case VNO_STATE_GC: + //sleep_on_spinunlock(&chmp->chm_lock_vnocache); + //KASSERT(!mutex_owned(&chmp->chm_lock_vnocache)); + goto retry; + break; + case VNO_STATE_PRESENT: + case VNO_STATE_READING: + chfs_err("Reading inode #%llu in state %d!\n", + (unsigned long long)vc->vno, vc->state); + chfs_err("wants to read a nonexistent ino %llu\n", + (unsigned long long)vc->vno); + return ENOENT; + default: + panic("BUG() Bad vno cache state."); + } + //mutex_exit(&chmp->chm_lock_vnocache); + + return chfs_read_inode_internal(chmp, ip); +} + +/* + * Read inode frags. + * Firstly get tmp nodes, + * secondly build fragtree from those. + */ +int +chfs_read_inode_internal(struct chfs_mount *chmp, struct chfs_inode *ip) +{ + int err; + size_t len, retlen; + char* buf; + struct chfs_readinode_info rii; + struct chfs_flash_vnode *fvnode; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + len = sizeof(*fvnode); + + memset(&rii, 0, sizeof(rii)); + + rb_tree_init(&rii.tdi_root, &tmp_node_rbtree_ops); + + /* build up a temp node frag tree */ + err = chfs_get_data_nodes(chmp, ip, &rii); + if (err) { + if (ip->chvc->state == VNO_STATE_READING) + ip->chvc->state = VNO_STATE_CHECKEDABSENT; + /* FIXME Should we kill fragtree or something here? */ + return err; + } + + rb_tree_init(&ip->fragtree, &frag_rbtree_ops); + /* + * build fragtree from temp nodes + */ + err = chfs_build_fragtree(chmp, ip, &rii); + if (err) { + if (ip->chvc->state == VNO_STATE_READING) + ip->chvc->state = VNO_STATE_CHECKEDABSENT; + /* FIXME Should we kill fragtree or something here? */ + return err; + } + + if (!rii.latest_ref) { + return 0; + } + + buf = kmem_alloc(len, KM_SLEEP); + if (!buf) + return ENOMEM; + + /* + * set inode size from chvc->v + */ + err = chfs_read_leb(chmp, ip->chvc->v->nref_lnr, buf, CHFS_GET_OFS(ip->chvc->v->nref_offset), len, &retlen); + if (err || retlen != len) { + kmem_free(buf, len); + return err?err:EIO; + } + + fvnode = (struct chfs_flash_vnode*)buf; + + dbg("set size from v: %u\n", fvnode->dn_size); + chfs_set_vnode_size(ITOV(ip), fvnode->dn_size); + uint32_t retsize = chfs_truncate_fragtree(chmp, &ip->fragtree, fvnode->dn_size); + if (retsize != fvnode->dn_size) { + dbg("Truncating failed. 
It is %u instead of %u\n", retsize, fvnode->dn_size); + } + + kmem_free(buf, len); + + if (ip->chvc->state == VNO_STATE_READING) { + ip->chvc->state = VNO_STATE_PRESENT; + } + + return 0; +} + +int +chfs_read_data(struct chfs_mount* chmp, struct vnode *vp, + struct buf *bp) +{ + off_t ofs; + struct chfs_node_frag *frag; + char * buf; + int err = 0; + size_t size, retlen; + uint32_t crc; + struct chfs_inode *ip = VTOI(vp); + struct chfs_flash_data_node *dnode; + struct chfs_node_ref *nref; + + memset(bp->b_data, 0, bp->b_bcount); + + ofs = bp->b_blkno * PAGE_SIZE; + frag = (struct chfs_node_frag *)rb_tree_find_node_leq(&ip->fragtree, &ofs); + + if (!frag || frag->ofs > ofs || frag->ofs + frag->size <= ofs) { + dbg("not found in frag tree\n"); + return 0; + } + + if (!frag->node) { + dbg("no node in frag\n"); + return 0; + } + + nref = frag->node->nref; + + size = sizeof(*dnode) + frag->size; + + buf = kmem_alloc(size, KM_SLEEP); + + dbg("reading from lnr: %u, offset: %u, size: %zu\n", nref->nref_lnr, CHFS_GET_OFS(nref->nref_offset), size); + err = chfs_read_leb(chmp, nref->nref_lnr, buf, CHFS_GET_OFS(nref->nref_offset), size, &retlen); + if (err) { + chfs_err("error after reading: %d\n", err); + goto out; + } + if (retlen != size) { + chfs_err("retlen: %zu != size: %zu\n", retlen, size); + err = EIO; + goto out; + } + + dnode = (struct chfs_flash_data_node *)buf; + crc = crc32(0, (uint8_t *)dnode, CHFS_NODE_HDR_SIZE - 4); + if (crc != le32toh(dnode->hdr_crc)) { + chfs_err("CRC check failed. calc: 0x%x orig: 0x%x\n", crc, le32toh(dnode->hdr_crc)); + err = EIO; + goto out; + } + //check header magic bitmask + if (le16toh(dnode->magic) != CHFS_FS_MAGIC_BITMASK) { + chfs_err("Wrong magic bitmask.\n"); + err = EIO; + goto out; + } + //check node crc + crc = crc32(0, (uint8_t *)dnode, sizeof(*dnode) - 4); + if (crc != le32toh(dnode->node_crc)) { + chfs_err("Node CRC check failed. calc: 0x%x orig: 0x%x\n", crc, le32toh(dnode->node_crc)); + err = EIO; + goto out; + } + crc = crc32(0, (uint8_t *)dnode->data, dnode->data_length); + if (crc != le32toh(dnode->data_crc)) { + chfs_err("Data CRC check failed. calc: 0x%x orig: 0x%x\n", crc, le32toh(dnode->data_crc)); + err = EIO; + goto out; + } + + memcpy(bp->b_data, dnode->data, dnode->data_length); + bp->b_resid = 0; + +out: + kmem_free(buf, size); + return err; +} diff --git a/sys/ufs/chfs/chfs_scan.c b/sys/ufs/chfs/chfs_scan.c new file mode 100644 index 000000000..a35ce7215 --- /dev/null +++ b/sys/ufs/chfs/chfs_scan.c @@ -0,0 +1,740 @@ +/* $NetBSD: chfs_scan.c,v 1.2 2011/11/24 21:09:37 agc Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * Copyright (c) 2010 David Tengeri + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * chfs_scan.c + * + * Created on: 2009.11.05. + * Author: dtengeri + */ + +#include "chfs.h" + +/** + * chfs_scan_make_vnode_cache - makes a new vnode cache during scan + * @chmp: CHFS main descriptor structure + * @vno: vnode identifier + * This function returns a vnode cache belonging to @vno. + */ +struct chfs_vnode_cache * +chfs_scan_make_vnode_cache(struct chfs_mount *chmp, ino_t vno) +{ + struct chfs_vnode_cache *vc; + + KASSERT(mutex_owned(&chmp->chm_lock_vnocache)); + + vc = chfs_vnode_cache_get(chmp, vno); + if (vc) { + return vc; + } + + if (vno > chmp->chm_max_vno) { + chmp->chm_max_vno = vno; + } + + vc = chfs_vnode_cache_alloc(vno); + + //mutex_enter(&chmp->chm_lock_vnocache); + + chfs_vnode_cache_add(chmp, vc); + + //mutex_exit(&chmp->chm_lock_vnocache); + + if (vno == CHFS_ROOTINO) { + vc->nlink = 2; + vc->pvno = CHFS_ROOTINO; + chfs_vnode_cache_set_state(chmp, + vc, VNO_STATE_CHECKEDABSENT); + } + + return vc; +} + +/** + * chfs_scan_check_node_hdr - checks node magic and crc + * @nhdr: node header to check + * Returns 0 if everything is OK, error code otherwise. + */ +int +chfs_scan_check_node_hdr(struct chfs_flash_node_hdr *nhdr) +{ + uint16_t magic; + uint32_t crc, hdr_crc; + + magic = le16toh(nhdr->magic); + + if (magic != CHFS_FS_MAGIC_BITMASK) { + dbg("bad magic\n"); + return CHFS_NODE_BADMAGIC; + } + + hdr_crc = le32toh(nhdr->hdr_crc); + crc = crc32(0, (uint8_t *)nhdr, CHFS_NODE_HDR_SIZE - 4); + + if (crc != hdr_crc) { + dbg("bad crc\n"); + return CHFS_NODE_BADCRC; + } + + return CHFS_NODE_OK; +} + +/** + * chfs_scan_check_vnode - check vnode crc and add to vnode cache + * @chmp: CHFS main descriptor structure + * @cheb: eraseblock informations + * @buf: vnode to check + * @ofs: offset in eraseblock where vnode starts + */ +int +chfs_scan_check_vnode(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb, void *buf, off_t ofs) +{ + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + struct chfs_vnode_cache *vc; + struct chfs_flash_vnode *vnode = buf; + struct chfs_node_ref *nref; + int err; + uint32_t crc; + ino_t vno; + + crc = crc32(0, (uint8_t *)vnode, + sizeof(struct chfs_flash_vnode) - 4); + + if (crc != le32toh(vnode->node_crc)) { + err = chfs_update_eb_dirty(chmp, + cheb, le32toh(vnode->length)); + if (err) { + return err; + } + + return CHFS_NODE_BADCRC; + } + + vno = le64toh(vnode->vno); + + mutex_enter(&chmp->chm_lock_vnocache); + vc = chfs_vnode_cache_get(chmp, vno); + if (!vc) { + vc = chfs_scan_make_vnode_cache(chmp, vno); + if (!vc) { + mutex_exit(&chmp->chm_lock_vnocache); + return ENOMEM; + } + } + mutex_exit(&chmp->chm_lock_vnocache); + + nref = chfs_alloc_node_ref(cheb); + + nref->nref_offset = ofs; + + KASSERT(nref->nref_lnr == cheb->lnr); + + /* Check version of vnode. 
*/ + if ((struct chfs_vnode_cache *)vc->v != vc) { + if (le64toh(vnode->version) > *vc->vno_version) { + //err = chfs_update_eb_dirty(chmp, &chmp->chm_blocks[vc->v->lnr], + // sizeof(struct chfs_flash_vnode)); + *vc->vno_version = le64toh(vnode->version); + chfs_add_vnode_ref_to_vc(chmp, vc, nref); + } else { + err = chfs_update_eb_dirty(chmp, cheb, + sizeof(struct chfs_flash_vnode)); + return CHFS_NODE_OK; + } + } else { + vc->vno_version = kmem_alloc(sizeof(uint64_t), KM_SLEEP); + if (!vc->vno_version) + return ENOMEM; + *vc->vno_version = le64toh(vnode->version); + chfs_add_vnode_ref_to_vc(chmp, vc, nref); + } + + mutex_enter(&chmp->chm_lock_sizes); + //dbg("B:lnr: %d |free_size: %d node's size: %d\n", cheb->lnr, cheb->free_size, le32toh(vnode->length)); + chfs_change_size_free(chmp, cheb, -le32toh(vnode->length)); + chfs_change_size_used(chmp, cheb, le32toh(vnode->length)); + mutex_exit(&chmp->chm_lock_sizes); + + KASSERT(cheb->used_size <= chmp->chm_ebh->eb_size); + + KASSERT(cheb->used_size + cheb->free_size + cheb->dirty_size + cheb->unchecked_size + cheb->wasted_size == chmp->chm_ebh->eb_size); + + //dbg(" A: free_size: %d\n", cheb->free_size); + + /*dbg("vnode dump:\n"); + dbg(" ->magic: 0x%x\n", le16toh(vnode->magic)); + dbg(" ->type: %d\n", le16toh(vnode->type)); + dbg(" ->length: %d\n", le32toh(vnode->length)); + dbg(" ->hdr_crc: 0x%x\n", le32toh(vnode->hdr_crc)); + dbg(" ->vno: %d\n", le64toh(vnode->vno)); + dbg(" ->version: %ld\n", le64toh(vnode->version)); + dbg(" ->uid: %d\n", le16toh(vnode->uid)); + dbg(" ->gid: %d\n", le16toh(vnode->gid)); + dbg(" ->mode: %d\n", le32toh(vnode->mode)); + dbg(" ->dn_size: %d\n", le32toh(vnode->dn_size)); + dbg(" ->atime: %d\n", le32toh(vnode->atime)); + dbg(" ->mtime: %d\n", le32toh(vnode->mtime)); + dbg(" ->ctime: %d\n", le32toh(vnode->ctime)); + dbg(" ->dsize: %d\n", le32toh(vnode->dsize)); + dbg(" ->node_crc: 0x%x\n", le32toh(vnode->node_crc));*/ + + return CHFS_NODE_OK; +} + +int +chfs_scan_mark_dirent_obsolete(struct chfs_mount *chmp, + struct chfs_vnode_cache *vc, struct chfs_dirent *fd) +{ + //int size; + struct chfs_eraseblock *cheb; + struct chfs_node_ref *prev, *nref; + + nref = fd->nref; + cheb = &chmp->chm_blocks[fd->nref->nref_lnr]; + + /* Remove dirent's node ref from vnode cache */ + prev = vc->dirents; + if (prev && prev == nref) { + vc->dirents = prev->nref_next; + } else if (prev && prev != (void *)vc) { + while (prev->nref_next && prev->nref_next != + (void *)vc && prev->nref_next != nref) { + prev = prev->nref_next; + } + + if (prev->nref_next == nref) { + prev->nref_next = nref->nref_next; + } + } + /*dbg("XXX - start\n"); + //nref = vc->dirents; + struct chfs_dirent *tmp; + tmp = vc->scan_dirents; + while (tmp) { + dbg(" ->tmp->name: %s\n", tmp->name); + dbg(" ->tmp->version: %ld\n", tmp->version); + dbg(" ->tmp->vno: %d\n", tmp->vno); + tmp = tmp->next; + } + dbg("XXX - end\n");*/ + //size = CHFS_PAD(sizeof(struct chfs_flash_dirent_node) + fd->nsize); + + KASSERT(cheb->used_size + cheb->free_size + cheb->dirty_size + + cheb->unchecked_size + cheb->wasted_size == chmp->chm_ebh->eb_size); + + return 0; +} + +void +chfs_add_fd_to_list(struct chfs_mount *chmp, + struct chfs_dirent *new, struct chfs_vnode_cache *pvc) +{ + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + int size; + struct chfs_eraseblock *cheb, *oldcheb; +// struct chfs_dirent **prev; + struct chfs_dirent *fd, *tmpfd; + + dbg("adding fd to list: %s\n", new->name); + + if ((new->version > pvc->highest_version)) + pvc->highest_version = new->version; + + 
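/*
 * Note (annotation, not part of the original patch): the code below keeps the
 * parent's scan_dirents list ordered by name hash (nhash). A new dirent is
 * inserted before the first entry with a larger hash; when an entry with the
 * same hash and name already exists, only the higher version is kept, the
 * older one is marked obsolete and its space is moved from used to dirty.
 * As a hypothetical example, with entries (nhash 0x10, version 1) and
 * (nhash 0x20, version 3) already queued, adding (0x20, version 5) for the
 * same name replaces version 3, while adding (0x30, version 1) simply goes
 * to the tail of the list.
 */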
size = CHFS_PAD(sizeof(struct chfs_flash_dirent_node) + + new->nsize); + cheb = &chmp->chm_blocks[new->nref->nref_lnr]; + + mutex_enter(&chmp->chm_lock_sizes); + TAILQ_FOREACH_SAFE(fd, &pvc->scan_dirents, fds, tmpfd) { + if (fd->nhash > new->nhash) { + /* insert new before fd */ + TAILQ_INSERT_BEFORE(fd, new, fds); + goto out; + } else if (fd->nhash == new->nhash && + !strcmp(fd->name, new->name)) { + if (new->version > fd->version) { +// new->next = fd->next; + /* replace fd with new */ + TAILQ_INSERT_BEFORE(fd, new, fds); + chfs_change_size_free(chmp, cheb, -size); + chfs_change_size_used(chmp, cheb, size); + + TAILQ_REMOVE(&pvc->scan_dirents, fd, fds); + if (fd->nref) { + size = CHFS_PAD(sizeof(struct chfs_flash_dirent_node) + fd->nsize); + chfs_scan_mark_dirent_obsolete(chmp, pvc, fd); + oldcheb = &chmp->chm_blocks[fd->nref->nref_lnr]; + chfs_change_size_used(chmp, oldcheb, -size); + chfs_change_size_dirty(chmp, oldcheb, size); + } + chfs_free_dirent(fd); +// *prev = new;//XXX + } else { + chfs_scan_mark_dirent_obsolete(chmp, pvc, new); + chfs_change_size_free(chmp, cheb, -size); + chfs_change_size_dirty(chmp, cheb, size); + chfs_free_dirent(new); + } + /*dbg("START\n"); + fd = pvc->scan_dirents; + while (fd) { + dbg("dirent dump:\n"); + dbg(" ->vno: %d\n", fd->vno); + dbg(" ->version: %ld\n", fd->version); + dbg(" ->nhash: 0x%x\n", fd->nhash); + dbg(" ->nsize: %d\n", fd->nsize); + dbg(" ->name: %s\n", fd->name); + dbg(" ->type: %d\n", fd->type); + fd = fd->next; + } + dbg("END\n");*/ + mutex_exit(&chmp->chm_lock_sizes); + return; + } + } + /* if we couldnt fit it elsewhere, lets add to the end */ + TAILQ_INSERT_TAIL(&pvc->scan_dirents, new, fds); + +out: + //dbg("B:lnr: %d |free_size: %d size: %d\n", cheb->lnr, cheb->free_size, size); + chfs_change_size_free(chmp, cheb, -size); + chfs_change_size_used(chmp, cheb, size); + mutex_exit(&chmp->chm_lock_sizes); + + KASSERT(cheb->used_size <= chmp->chm_ebh->eb_size); + //dbg(" A: free_size: %d\n", cheb->free_size); + + KASSERT(cheb->used_size + cheb->free_size + cheb->dirty_size + cheb->unchecked_size + cheb->wasted_size == chmp->chm_ebh->eb_size); + + +// fd = pvc->scan_dirents; + /*dbg("START\n"); + while (fd) { + dbg("dirent dump:\n"); + dbg(" ->vno: %d\n", fd->vno); + dbg(" ->version: %ld\n", fd->version); + dbg(" ->nhash: 0x%x\n", fd->nhash); + dbg(" ->nsize: %d\n", fd->nsize); + dbg(" ->name: %s\n", fd->name); + dbg(" ->type: %d\n", fd->type); + fd = fd->next; + } + dbg("END\n");*/ +} +/** + * chfs_scan_check_dirent_node - check vnode crc and add to vnode cache + * @chmp: CHFS main descriptor structure + * @cheb: eraseblock informations + * @buf: directory entry to check + * @ofs: offset in eraseblock where dirent starts + */ +int +chfs_scan_check_dirent_node(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb, void *buf, off_t ofs) +{ + int err, namelen; + uint32_t crc; + struct chfs_dirent *fd; + struct chfs_vnode_cache *vc; + struct chfs_flash_dirent_node *dirent = buf; + + //struct chfs_node_ref *tmp; + + crc = crc32(0, (uint8_t *)dirent, sizeof(*dirent) - 4); + if (crc != le32toh(dirent->node_crc)) { + err = chfs_update_eb_dirty(chmp, cheb, le32toh(dirent->length)); + if (err) + return err; + return CHFS_NODE_BADCRC; + } + namelen = dirent->nsize; + + fd = chfs_alloc_dirent(namelen + 1); + if (!fd) + return ENOMEM; + + fd->nref = chfs_alloc_node_ref(cheb); + if (!fd->nref) + return ENOMEM; + + KASSERT(fd->nref->nref_lnr == cheb->lnr); + + memcpy(&fd->name, dirent->name, namelen); + fd->nsize = namelen; + fd->name[namelen] = 
0; + crc = crc32(0, fd->name, dirent->nsize); + if (crc != le32toh(dirent->name_crc)) { + chfs_err("Directory entry's name has bad crc: read: 0x%x, " + "calculated: 0x%x\n", le32toh(dirent->name_crc), crc); + chfs_free_dirent(fd); + err = chfs_update_eb_dirty(chmp, cheb, le32toh(dirent->length)); + if (err) + return err; + return CHFS_NODE_BADNAMECRC; + } + + /* Check vnode_cache of parent node */ + mutex_enter(&chmp->chm_lock_vnocache); + vc = chfs_scan_make_vnode_cache(chmp, le64toh(dirent->pvno)); + mutex_exit(&chmp->chm_lock_vnocache); + if (!vc) { + chfs_free_dirent(fd); + return ENOMEM; + } + + fd->nref->nref_offset = ofs; + + dbg("add dirent to #%llu\n", (unsigned long long)vc->vno); + chfs_add_node_to_list(chmp, vc, fd->nref, &vc->dirents); + /*tmp = vc->dirents; + dbg("START|vno: %d dirents dump\n", vc->vno); + while (tmp) { + dbg(" ->nref->nref_lnr: %d\n", tmp->lnr); + dbg(" ->nref->nref_offset: %d\n", tmp->offset); + tmp = tmp->next; + } + dbg(" END|vno: %d dirents dump\n", vc->vno);*/ + +// fd->next = NULL; + fd->vno = le64toh(dirent->vno); + fd->version = le64toh(dirent->version); + fd->nhash = hash32_buf(fd->name, namelen, HASH32_BUF_INIT); + fd->type = dirent->dtype; + + /*dbg("dirent dump:\n"); + dbg(" ->vno: %d\n", fd->vno); + dbg(" ->version: %ld\n", fd->version); + dbg(" ->nhash: 0x%x\n", fd->nhash); + dbg(" ->nsize: %d\n", fd->nsize); + dbg(" ->name: %s\n", fd->name); + dbg(" ->type: %d\n", fd->type);*/ + + chfs_add_fd_to_list(chmp, fd, vc); + + /*struct chfs_node_ref *tmp; + tmp = vc->dirents; + dbg("START|vno: %d dirents dump\n", vc->vno); + while (tmp) { + dbg(" ->nref->nref_lnr: %d\n", tmp->lnr); + dbg(" ->nref->nref_offset: %d\n", tmp->offset); + tmp = tmp->next; + } + dbg(" END|vno: %d dirents dump\n", vc->vno);*/ + + /*dbg("dirent dump:\n"); + dbg(" ->magic: 0x%x\n", le16toh(dirent->magic)); + dbg(" ->type: %d\n", le16toh(dirent->type)); + dbg(" ->length: %d\n", le32toh(dirent->length)); + dbg(" ->hdr_crc: 0x%x\n", le32toh(dirent->hdr_crc)); + dbg(" ->vno: %d\n", le64toh(dirent->vno)); + dbg(" ->pvno: %d\n", le64toh(dirent->pvno)); + dbg(" ->version: %ld\n", le64toh(dirent->version)); + dbg(" ->mctime: %d\n", le32toh(dirent->mctime)); + dbg(" ->nsize: %d\n", dirent->nsize); + dbg(" ->dtype: %d\n", dirent->dtype); + dbg(" ->name_crc: 0x%x\n", le32toh(dirent->name_crc)); + dbg(" ->node_crc: 0x%x\n", le32toh(dirent->node_crc)); + dbg(" ->name: %s\n", dirent->name);*/ + + return CHFS_NODE_OK; +} + +/** + * chfs_scan_check_data_node - check vnode crc and add to vnode cache + * @chmp: CHFS main descriptor structure + * @cheb: eraseblock informations + * @buf: data node to check + * @ofs: offset in eraseblock where data node starts + */ +int +chfs_scan_check_data_node(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb, void *buf, off_t ofs) +{ + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + int err; + uint32_t crc, vno; + struct chfs_node_ref *nref; + struct chfs_vnode_cache *vc; + struct chfs_flash_data_node *dnode = buf; + + crc = crc32(0, (uint8_t *)dnode, sizeof(struct chfs_flash_data_node) - 4); + if (crc != le32toh(dnode->node_crc)) { + err = chfs_update_eb_dirty(chmp, cheb, le32toh(dnode->length)); + if (err) + return err; + return CHFS_NODE_BADCRC; + } + /** + * Don't check data nodes crc and version here, it will be done in + * the background GC thread. 
+ */ + nref = chfs_alloc_node_ref(cheb); + if (!nref) + return ENOMEM; + + nref->nref_offset = ofs | CHFS_UNCHECKED_NODE_MASK; + + KASSERT(nref->nref_lnr == cheb->lnr); + + vno = le64toh(dnode->vno); + mutex_enter(&chmp->chm_lock_vnocache); + vc = chfs_vnode_cache_get(chmp, vno); + if (!vc) { + vc = chfs_scan_make_vnode_cache(chmp, vno); + if (!vc) + return ENOMEM; + } + mutex_exit(&chmp->chm_lock_vnocache); + chfs_add_node_to_list(chmp, vc, nref, &vc->dnode); + + dbg("chmpfree: %u, chebfree: %u, dnode: %u\n", chmp->chm_free_size, cheb->free_size, dnode->length); + + mutex_enter(&chmp->chm_lock_sizes); + chfs_change_size_free(chmp, cheb, -dnode->length); + chfs_change_size_unchecked(chmp, cheb, dnode->length); + mutex_exit(&chmp->chm_lock_sizes); + return CHFS_NODE_OK; +} + +/** + * chfs_scan_classify_cheb - determine eraseblock's state + * @chmp: CHFS main descriptor structure + * @cheb: eraseblock to classify + */ +int +chfs_scan_classify_cheb(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb) +{ + if (cheb->free_size == chmp->chm_ebh->eb_size) + return CHFS_BLK_STATE_FREE; + else if (cheb->dirty_size < MAX_DIRTY_TO_CLEAN) + return CHFS_BLK_STATE_CLEAN; + else if (cheb->used_size || cheb->unchecked_size) + return CHFS_BLK_STATE_PARTDIRTY; + else + return CHFS_BLK_STATE_ALLDIRTY; +} + + +/** + * chfs_scan_eraseblock - scans an eraseblock and looks for nodes + * @chmp: CHFS main descriptor structure + * @cheb: eraseblock to scan + * + * This function scans a whole eraseblock, checks the nodes on it and adds them + * to the vnode cache. + * Returns the eraseblock state on success, or an error code on failure. + */ +int +chfs_scan_eraseblock(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb) { + + int err; + size_t len, retlen; + off_t ofs = 0; + int lnr = cheb->lnr; + u_char *buf; + struct chfs_flash_node_hdr *nhdr; + int read_free = 0; + struct chfs_node_ref *nref; + + + dbg("scanning eraseblock content: %d free_size: %d\n", cheb->lnr, cheb->free_size); + dbg("scanned physical block: %d\n", chmp->chm_ebh->lmap[lnr]); + buf = kmem_alloc(CHFS_MAX_NODE_SIZE, KM_SLEEP); + + while((ofs + CHFS_NODE_HDR_SIZE) < chmp->chm_ebh->eb_size) { + memset(buf, 0 , CHFS_MAX_NODE_SIZE); + err = chfs_read_leb(chmp, + lnr, buf, ofs, CHFS_NODE_HDR_SIZE, &retlen); + if (err) { + return err; + } + + if (retlen != CHFS_NODE_HDR_SIZE) { + chfs_err("Error reading node header: " + "read: %zu instead of: %zu\n", + CHFS_NODE_HDR_SIZE, retlen); + return EIO; + } + + /* first we check if the buffer we read is full of 0xff; if so, maybe + * the block's remaining area is free. We increase read_free and if it + * reaches MAX_READ_FREE we stop reading the block. */ + if (check_pattern(buf, 0xff, 0, CHFS_NODE_HDR_SIZE)) { + read_free += CHFS_NODE_HDR_SIZE; + if (read_free >= MAX_READ_FREE(chmp)) { + dbg("rest of the block is free.
Size: %d\n", cheb->free_size); + return chfs_scan_classify_cheb(chmp, cheb); + } + ofs += CHFS_NODE_HDR_SIZE; + continue; + } else { + chfs_update_eb_dirty(chmp, cheb, read_free); + read_free = 0; + } + + nhdr = (struct chfs_flash_node_hdr *)buf; + + err = chfs_scan_check_node_hdr(nhdr); + if (err) { + dbg("node hdr error\n"); + err = chfs_update_eb_dirty(chmp, cheb, 4); + if (err) { + return err; + } + + ofs += 4; + continue; + } + ofs += CHFS_NODE_HDR_SIZE; + if (ofs > chmp->chm_ebh->eb_size) { + chfs_err("Second part of node is on the next eraseblock.\n"); + return EIO; + } + switch (le16toh(nhdr->type)) { + case CHFS_NODETYPE_VNODE: + /* Read up the node */ + //dbg("nodetype vnode\n"); + len = le32toh(nhdr->length) - CHFS_NODE_HDR_SIZE; + err = chfs_read_leb(chmp, + lnr, buf + CHFS_NODE_HDR_SIZE, + ofs, len, &retlen); + if (err) { + return err; + } + + if (retlen != len) { + chfs_err("Error reading vnode: read: %zu instead of: %zu\n", + len, retlen); + return EIO; + } + KASSERT(lnr == cheb->lnr); + err = chfs_scan_check_vnode(chmp, + cheb, buf, ofs - CHFS_NODE_HDR_SIZE); + if (err) { + return err; + } + + //dbg("XXX5end\n"); + break; + case CHFS_NODETYPE_DIRENT: + /* Read up the node */ + //dbg("nodetype dirent\n"); + len = le32toh(nhdr->length) - CHFS_NODE_HDR_SIZE; + + err = chfs_read_leb(chmp, + lnr, buf + CHFS_NODE_HDR_SIZE, + ofs, len, &retlen); + if (err) { + return err; + } + + if (retlen != len) { + chfs_err("Error reading dirent node: read: %zu " + "instead of: %zu\n", len, retlen); + return EIO; + } + + KASSERT(lnr == cheb->lnr); + + err = chfs_scan_check_dirent_node(chmp, + cheb, buf, ofs - CHFS_NODE_HDR_SIZE); + if (err) { + return err; + } + + //dbg("XXX6end\n"); + break; + case CHFS_NODETYPE_DATA: + //dbg("nodetype data\n"); + len = sizeof(struct chfs_flash_data_node) - + CHFS_NODE_HDR_SIZE; + err = chfs_read_leb(chmp, + lnr, buf + CHFS_NODE_HDR_SIZE, + ofs, len, &retlen); + if (err) { + return err; + } + + if (retlen != len) { + chfs_err("Error reading data node: read: %zu " + "instead of: %zu\n", len, retlen); + return EIO; + } + KASSERT(lnr == cheb->lnr); + err = chfs_scan_check_data_node(chmp, + cheb, buf, ofs - CHFS_NODE_HDR_SIZE); + if (err) + return err; + + //dbg("XXX7end\n"); + break; + case CHFS_NODETYPE_PADDING: + //dbg("nodetype padding\n"); + //dbg("padding len: %d\n", le32toh(nhdr->length)); + //dbg("BEF: cheb->free_size: %d\n", cheb->free_size); + nref = chfs_alloc_node_ref(cheb); + nref->nref_offset = ofs - CHFS_NODE_HDR_SIZE; + nref->nref_offset = CHFS_GET_OFS(nref->nref_offset) | + CHFS_OBSOLETE_NODE_MASK; + + err = chfs_update_eb_dirty(chmp, cheb, + le32toh(nhdr->length)); + //dbg("AFT: cheb->free_size: %d\n", cheb->free_size); + if (err) + return err; + + //dbg("XXX8end\n"); + break; + default: + //dbg("nodetype ? 
(default)\n"); + /* Unknown node type, update dirty and skip */ + err = chfs_update_eb_dirty(chmp, cheb, + le32toh(nhdr->length)); + if (err) + return err; + + //dbg("XXX9end\n"); + break; + } + ofs += le32toh(nhdr->length) - CHFS_NODE_HDR_SIZE; + } + + KASSERT(cheb->used_size + cheb->free_size + cheb->dirty_size + + cheb->unchecked_size + cheb->wasted_size == chmp->chm_ebh->eb_size); + + //dbg("XXX10\n"); + return chfs_scan_classify_cheb(chmp, cheb); +} diff --git a/sys/ufs/chfs/chfs_subr.c b/sys/ufs/chfs/chfs_subr.c new file mode 100644 index 000000000..00cd82f32 --- /dev/null +++ b/sys/ufs/chfs/chfs_subr.c @@ -0,0 +1,540 @@ +/* $NetBSD: chfs_subr.c,v 1.2 2011/11/24 21:09:37 agc Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * Copyright (C) 2010 Tamas Toth + * Copyright (C) 2010 Adam Hoka + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Efficient memory file system supporting functions. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include "chfs.h" +//#include +//#include + +/* --------------------------------------------------------------------- */ + +/* + * Returns information about the number of available memory pages, + * including physical and virtual ones. + * + * If 'total' is true, the value returned is the total amount of memory + * pages configured for the system (either in use or free). + * If it is FALSE, the value returned is the amount of free memory pages. + * + * Remember to remove DUMMYFS_PAGES_RESERVED from the returned value to avoid + * excessive memory usage. 
+ * + */ +size_t +chfs_mem_info(bool total) +{ + size_t size; + + size = 0; + size += uvmexp.swpgavail; + if (!total) { + size -= uvmexp.swpgonly; + } + size += uvmexp.free; + size += uvmexp.filepages; + if (size > uvmexp.wired) { + size -= uvmexp.wired; + } else { + size = 0; + } + + return size; +} + + +/* --------------------------------------------------------------------- */ + +/* + * Looks for a directory entry in the directory represented by node. + * 'cnp' describes the name of the entry to look for. Note that the . + * and .. components are not allowed as they do not physically exist + * within directories. + * + * Returns a pointer to the entry when found, otherwise NULL. + */ +struct chfs_dirent * +chfs_dir_lookup(struct chfs_inode *ip, struct componentname *cnp) +{ + bool found; + struct chfs_dirent *fd; + dbg("dir_lookup()\n"); + + KASSERT(IMPLIES(cnp->cn_namelen == 1, cnp->cn_nameptr[0] != '.')); + KASSERT(IMPLIES(cnp->cn_namelen == 2, !(cnp->cn_nameptr[0] == '.' && + cnp->cn_nameptr[1] == '.'))); + //CHFS_VALIDATE_DIR(node); + + //node->chn_status |= CHFS_NODE_ACCESSED; + + found = false; +// fd = ip->dents; +// while(fd) { + TAILQ_FOREACH(fd, &ip->dents, fds) { + KASSERT(cnp->cn_namelen < 0xffff); + if (fd->vno == 0) + continue; + /*dbg("dirent dump:\n"); + dbg(" ->vno: %d\n", fd->vno); + dbg(" ->version: %ld\n", fd->version); + dbg(" ->nhash: 0x%x\n", fd->nhash); + dbg(" ->nsize: %d\n", fd->nsize); + dbg(" ->name: %s\n", fd->name); + dbg(" ->type: %d\n", fd->type);*/ + if (fd->nsize == (uint16_t)cnp->cn_namelen && + memcmp(fd->name, cnp->cn_nameptr, fd->nsize) == 0) { + found = true; + break; + } +// fd = fd->next; + } + + return found ? fd : NULL; +} + +/* --------------------------------------------------------------------- */ + +int +chfs_filldir(struct uio* uio, ino_t ino, const char *name, + int namelen, enum vtype type) +{ + struct dirent dent; + int error; + + memset(&dent, 0, sizeof(dent)); + + dent.d_fileno = ino; + switch (type) { + case VBLK: + dent.d_type = DT_BLK; + break; + + case VCHR: + dent.d_type = DT_CHR; + break; + + case VDIR: + dent.d_type = DT_DIR; + break; + + case VFIFO: + dent.d_type = DT_FIFO; + break; + + case VLNK: + dent.d_type = DT_LNK; + break; + + case VREG: + dent.d_type = DT_REG; + break; + + case VSOCK: + dent.d_type = DT_SOCK; + break; + + default: + KASSERT(0); + } + dent.d_namlen = namelen; + (void)memcpy(dent.d_name, name, dent.d_namlen); + dent.d_reclen = _DIRENT_SIZE(&dent); + + if (dent.d_reclen > uio->uio_resid) { + error = -1; + } else { + error = uiomove(&dent, dent.d_reclen, uio); + } + + return error; +} + + +/* --------------------------------------------------------------------- */ + +/* + * Change size of the given vnode. + * Caller should execute chfs_update on vp after a successful execution. + * The vnode must be locked on entry and remain locked on exit. + */ +int +chfs_chsize(struct vnode *vp, u_quad_t size, kauth_cred_t cred) +{ + struct chfs_mount *chmp; + struct chfs_inode *ip; + struct buf *bp; + int blknum, append; + int error = 0; + char *buf = NULL; + struct chfs_full_dnode *fd; + + ip = VTOI(vp); + chmp = ip->chmp; + + dbg("chfs_chsize\n"); + + switch (vp->v_type) { + case VDIR: + return EISDIR; + case VLNK: + case VREG: + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return EROFS; + break; + case VBLK: + case VCHR: + case VFIFO: + return 0; + default: + return EOPNOTSUPP; /* XXX why not ENODEV? 
*/ + } + + vflushbuf(vp, 0); + + mutex_enter(&chmp->chm_lock_mountfields); + chfs_flush_pending_wbuf(chmp); + + /* handle truncate to zero as a special case */ + if (size == 0) { + dbg("truncate to zero"); + chfs_truncate_fragtree(ip->chmp, + &ip->fragtree, size); + chfs_set_vnode_size(vp, size); + + mutex_exit(&chmp->chm_lock_mountfields); + + return 0; + } + + + /* allocate zeros for the new data */ + buf = kmem_zalloc(size, KM_SLEEP); + bp = getiobuf(vp, true); + + if (ip->size != 0) { + /* read the whole data */ + bp->b_blkno = 0; + bp->b_bufsize = bp->b_resid = bp->b_bcount = ip->size; + bp->b_data = kmem_alloc(ip->size, KM_SLEEP); + + error = chfs_read_data(chmp, vp, bp); + if (error) { + mutex_exit(&chmp->chm_lock_mountfields); + putiobuf(bp); + + return error; + } + + /* create the new data */ + dbg("create new data vap%llu ip%llu\n", + (unsigned long long)size, (unsigned long long)ip->size); + append = size - ip->size; + if (append > 0) { + memcpy(buf, bp->b_data, ip->size); + } else { + memcpy(buf, bp->b_data, size); + chfs_truncate_fragtree(ip->chmp, + &ip->fragtree, size); + } + + kmem_free(bp->b_data, ip->size); + + struct chfs_node_frag *lastfrag = frag_last(&ip->fragtree); + fd = lastfrag->node; + chfs_mark_node_obsolete(chmp, fd->nref); + + blknum = lastfrag->ofs / PAGE_SIZE; + lastfrag->size = append > PAGE_SIZE ? PAGE_SIZE : size % PAGE_SIZE; + } else { + fd = chfs_alloc_full_dnode(); + blknum = 0; + } + + chfs_set_vnode_size(vp, size); + + // write the new data + for (bp->b_blkno = blknum; bp->b_blkno * PAGE_SIZE < size; bp->b_blkno++) { + uint64_t writesize = MIN(size - bp->b_blkno * PAGE_SIZE, PAGE_SIZE); + + bp->b_bufsize = bp->b_resid = bp->b_bcount = writesize; + bp->b_data = kmem_alloc(writesize, KM_SLEEP); + + memcpy(bp->b_data, buf + (bp->b_blkno * PAGE_SIZE), writesize); + + if (bp->b_blkno != blknum) { + fd = chfs_alloc_full_dnode(); + } + + error = chfs_write_flash_dnode(chmp, vp, bp, fd); + if (error) { + mutex_exit(&chmp->chm_lock_mountfields); + kmem_free(bp->b_data, writesize); + putiobuf(bp); + + return error; + } + if (bp->b_blkno != blknum) { + chfs_add_full_dnode_to_inode(chmp, ip, fd); + } + kmem_free(bp->b_data, writesize); + } + + mutex_exit(&chmp->chm_lock_mountfields); + + kmem_free(buf, size); + putiobuf(bp); + + return 0; +} +#if 0 + int error; + struct chfs_node *node; + + KASSERT(VOP_ISLOCKED(vp)); + + node = VP_TO_CHFS_NODE(vp); + + // Decide whether this is a valid operation based on the file type. + error = 0; + switch (vp->v_type) { + case VDIR: + return EISDIR; + + case VREG: + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return EROFS; + break; + + case VBLK: + case VCHR: + case VFIFO: + // Allow modifications of special files even if in the file + // system is mounted read-only (we are not modifying the + // files themselves, but the objects they represent). + return 0; + + default: + return ENODEV; + } + + // Immutable or append-only files cannot be modified, either. + if (node->chn_flags & (IMMUTABLE | APPEND)) + return EPERM; + + error = chfs_truncate(vp, size); + // chfs_truncate will raise the NOTE_EXTEND and NOTE_ATTRIB kevents + // for us, as will update dn_status; no need to do that here. + + KASSERT(VOP_ISLOCKED(vp)); + + return error; +#endif + +/* --------------------------------------------------------------------- */ + +/* + * Change flags of the given vnode. + * Caller should execute chfs_update on vp after a successful execution. + * The vnode must be locked on entry and remain locked on exit. 
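+ * Only the superuser may set the system flags (SF_*); the file owner is
+ * limited to the user-settable flags (UF_*).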
+ */ +int +chfs_chflags(struct vnode *vp, int flags, kauth_cred_t cred) +{ + struct chfs_mount *chmp; + struct chfs_inode *ip; + int error = 0; + + ip = VTOI(vp); + chmp = ip->chmp; + + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return EROFS; + + if (kauth_cred_geteuid(cred) != ip->uid && + (error = kauth_authorize_generic(cred, + KAUTH_GENERIC_ISSUSER, NULL))) + return error; + + if (kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, + NULL) == 0) { + if ((ip->flags & (SF_IMMUTABLE | SF_APPEND)) && + kauth_authorize_system(curlwp->l_cred, + KAUTH_SYSTEM_CHSYSFLAGS, 0, NULL, NULL, NULL)) + return EPERM; + + if ((flags & SF_SNAPSHOT) != + (ip->flags & SF_SNAPSHOT)) + return EPERM; + + ip->flags = flags; + } else { + if ((ip->flags & (SF_IMMUTABLE | SF_APPEND)) || + (flags & UF_SETTABLE) != flags) + return EPERM; + + if ((ip->flags & SF_SETTABLE) != + (flags & SF_SETTABLE)) + return EPERM; + + ip->flags &= SF_SETTABLE; + ip->flags |= (flags & UF_SETTABLE); + } + ip->iflag |= IN_CHANGE; + error = chfs_update(vp, NULL, NULL, UPDATE_WAIT); + if (error) + return error; + + if (flags & (IMMUTABLE | APPEND)) + return 0; + + return error; +} + +/* --------------------------------------------------------------------- */ + +void +chfs_itimes(struct chfs_inode *ip, const struct timespec *acc, + const struct timespec *mod, const struct timespec *cre) +{ + //dbg("itimes\n"); + struct timespec now; + + if (!(ip->iflag & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY))) { + return; + } + + vfs_timestamp(&now); + if (ip->iflag & IN_ACCESS) { + if (acc == NULL) + acc = &now; + ip->atime = acc->tv_sec; + } + if (ip->iflag & (IN_UPDATE | IN_MODIFY)) { + if (mod == NULL) + mod = &now; + ip->mtime = mod->tv_sec; + //ip->i_modrev++; + } + if (ip->iflag & (IN_CHANGE | IN_MODIFY)) { + if (cre == NULL) + cre = &now; + ip->ctime = cre->tv_sec; + } + if (ip->iflag & (IN_ACCESS | IN_MODIFY)) + ip->iflag |= IN_ACCESSED; + if (ip->iflag & (IN_UPDATE | IN_CHANGE)) + ip->iflag |= IN_MODIFIED; + ip->iflag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY); +} + +/* --------------------------------------------------------------------- */ + +int +chfs_update(struct vnode *vp, const struct timespec *acc, + const struct timespec *mod, int flags) +{ + + struct chfs_inode *ip; + + /* XXX ufs_reclaim calls this function unlocked! */ +// KASSERT(VOP_ISLOCKED(vp)); + +#if 0 + if (flags & UPDATE_CLOSE) + ; /* XXX Need to do anything special? 
*/ +#endif + + ip = VTOI(vp); + chfs_itimes(ip, acc, mod, NULL); + +// KASSERT(VOP_ISLOCKED(vp)); + return (0); +} + +/* --------------------------------------------------------------------- */ +/* + int + chfs_truncate(struct vnode *vp, off_t length) + { + bool extended; + int error; + struct chfs_node *node; + printf("CHFS: truncate()\n"); + + node = VP_TO_CHFS_NODE(vp); + extended = length > node->chn_size; + + if (length < 0) { + error = EINVAL; + goto out; + } + + if (node->chn_size == length) { + error = 0; + goto out; + } + + error = chfs_reg_resize(vp, length); + if (error == 0) + node->chn_status |= CHFS_NODE_CHANGED | CHFS_NODE_MODIFIED; + + out: + chfs_update(vp, NULL, NULL, 0); + + return error; + }*/ + + diff --git a/sys/ufs/chfs/chfs_vfsops.c b/sys/ufs/chfs/chfs_vfsops.c new file mode 100644 index 000000000..a08d15fb7 --- /dev/null +++ b/sys/ufs/chfs/chfs_vfsops.c @@ -0,0 +1,847 @@ +/* $NetBSD: chfs_vfsops.c,v 1.2 2011/11/24 21:09:37 agc Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * Copyright (C) 2010 Tamas Toth + * Copyright (C) 2010 Adam Hoka + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +//XXX needed just for debugging +#include +#include +#include +#include + +#include +#include +#include +//#include +#include +#include +#include +#include +//#include +//#include +#include "chfs.h" +#include "chfs_args.h" + +MODULE(MODULE_CLASS_VFS, chfs, "flash"); + +/* --------------------------------------------------------------------- */ +/* functions */ + +static int chfs_mount(struct mount *, const char *, void *, size_t *); +static int chfs_unmount(struct mount *, int); +static int chfs_root(struct mount *, struct vnode **); +static int chfs_vget(struct mount *, ino_t, struct vnode **); +static int chfs_fhtovp(struct mount *, struct fid *, struct vnode **); +static int chfs_vptofh(struct vnode *, struct fid *, size_t *); +static int chfs_start(struct mount *, int); +static int chfs_statvfs(struct mount *, struct statvfs *); +static int chfs_sync(struct mount *, int, kauth_cred_t); +static void chfs_init(void); +static void chfs_reinit(void); +static void chfs_done(void); +static int chfs_snapshot(struct mount *, struct vnode *, + struct timespec *); + +/* --------------------------------------------------------------------- */ +/* structures */ + +int +chfs_gop_alloc(struct vnode *vp, off_t off, off_t len, int flags, + kauth_cred_t cred) +{ + return (0); +} + +const struct genfs_ops chfs_genfsops = { + .gop_size = genfs_size, + .gop_alloc = chfs_gop_alloc, + .gop_write = genfs_gop_write, + .gop_markupdate = ufs_gop_markupdate, +}; + +/* +static const struct ufs_ops chfs_ufsops = { + .uo_itimes = chfs_itimes, + .uo_update = chfs_update, +}; +*/ + +struct pool chfs_inode_pool; + +/* for looking up the major for flash */ +extern const struct cdevsw flash_cdevsw; + +/* --------------------------------------------------------------------- */ + +static int +chfs_mount(struct mount *mp, + const char *path, void *data, size_t *data_len) +{ + struct lwp *l = curlwp; + struct nameidata nd; + struct pathbuf *pb; + struct vnode *devvp = NULL; + struct ufs_args *args = data; + struct ufsmount *ump = NULL; + struct chfs_mount *chmp; + int err = 0; + int xflags; + + dbg("mount()\n"); + + if (*data_len < sizeof *args) + return EINVAL; + + if (mp->mnt_flag & MNT_GETARGS) { + ump = VFSTOUFS(mp); + if (ump == NULL) + return EIO; + memset(args, 0, sizeof *args); + args->fspec = NULL; + *data_len = sizeof *args; + return 0; + } + + if (mp->mnt_flag & MNT_UPDATE) { + /* XXX: There is no support yet to update file system + * settings. Should be added. */ + + return ENODEV; + } + + if (args->fspec != NULL) { + err = pathbuf_copyin(args->fspec, &pb); + if (err) { + return err; + } + /* + * Look up the name and verify that it's sane. 
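+ * The resolved vnode must refer to a block device backed by a registered
+ * block device driver.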
+ */ + NDINIT(&nd, LOOKUP, FOLLOW, pb); + if ((err = namei(&nd)) != 0 ) + return (err); + devvp = nd.ni_vp; + + /* + * Be sure this is a valid block device + */ + if (devvp->v_type != VBLK) + err = ENOTBLK; + else if (bdevsw_lookup(devvp->v_rdev) == NULL) + err = ENXIO; + } + + if (err) { + vrele(devvp); + return (err); + } + + if (mp->mnt_flag & MNT_RDONLY) + xflags = FREAD; + else + xflags = FREAD|FWRITE; + + err = VOP_OPEN(devvp, xflags, FSCRED); + if (err) + goto fail; + + + err = chfs_mountfs(devvp, mp); + if (err) { + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + (void)VOP_CLOSE(devvp, xflags, NOCRED); + VOP_UNLOCK(devvp); + goto fail; + } + ump = VFSTOUFS(mp); + chmp = ump->um_chfs; + + vfs_getnewfsid(mp); + chmp->chm_fsmp = mp; + + return set_statvfs_info(path, + UIO_USERSPACE, args->fspec, + UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l); + +fail: + vrele(devvp); + return (err); +} + + +int +chfs_mountfs(struct vnode *devvp, struct mount *mp) +{ + struct lwp *l = curlwp; + struct proc *p; + kauth_cred_t cred; + devmajor_t flash_major; + dev_t dev; + struct ufsmount* ump = NULL; + struct chfs_mount* chmp; + struct vnode *vp; + int err = 0; + + dbg("mountfs()\n"); + + dev = devvp->v_rdev; + p = l ? l->l_proc : NULL; + cred = l ? l->l_cred : NOCRED; + + /* Flush out any old buffers remaining from a previous use. */ + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + err = vinvalbuf(devvp, V_SAVE, cred, l, 0, 0); + VOP_UNLOCK(devvp); + if (err) + return (err); + + flash_major = cdevsw_lookup_major(&flash_cdevsw); + + if (devvp->v_type != VBLK) + err = ENOTBLK; + else if (bdevsw_lookup(dev) == NULL) + err = ENXIO; + else if (major(dev) != flash_major) { + dbg("major(dev): %d, flash_major: %d\n", + major(dev), flash_major); + err = ENODEV; + } + if (err) { + vrele(devvp); + return (err); + } + + ump = malloc(sizeof(*ump), M_UFSMNT, M_WAITOK); + memset(ump, 0, sizeof(*ump)); + ump->um_fstype = UFS1; + //ump->um_ops = &chfs_ufsops; + ump->um_chfs = malloc(sizeof(struct chfs_mount), + M_UFSMNT, M_WAITOK); + memset(ump->um_chfs, 0, sizeof(struct chfs_mount)); + + mutex_init(&ump->um_lock, MUTEX_DEFAULT, IPL_NONE); + + /* Get superblock and set flash device number */ + chmp = ump->um_chfs; + if (!chmp) + return ENOMEM; + + chmp->chm_ebh = kmem_alloc(sizeof(struct chfs_ebh), KM_SLEEP); + + dbg("[]opening flash: %u\n", (unsigned int)devvp->v_rdev); + err = ebh_open(chmp->chm_ebh, devvp->v_rdev); + if (err) { + dbg("error while opening flash\n"); + kmem_free(chmp->chm_ebh, sizeof(struct chfs_ebh)); + free(chmp, M_UFSMNT); + return err; + } + + //TODO check flash sizes + + chmp->chm_gbl_version = 0; + chmp->chm_vnocache_hash = chfs_vnocache_hash_init(); + + chmp->chm_blocks = kmem_zalloc(chmp->chm_ebh->peb_nr * + sizeof(struct chfs_eraseblock), KM_SLEEP); + + if (!chmp->chm_blocks) { + kmem_free(chmp->chm_ebh, chmp->chm_ebh->peb_nr * + sizeof(struct chfs_eraseblock)); + ebh_close(chmp->chm_ebh); + free(chmp, M_UFSMNT); + return ENOMEM; + } + + mutex_init(&chmp->chm_lock_mountfields, MUTEX_DEFAULT, IPL_NONE); + mutex_init(&chmp->chm_lock_sizes, MUTEX_DEFAULT, IPL_NONE); + mutex_init(&chmp->chm_lock_vnocache, MUTEX_DEFAULT, IPL_NONE); + + //XXX + chmp->chm_fs_bmask = -4096; + chmp->chm_fs_bsize = 4096; + chmp->chm_fs_qbmask = 4095; + chmp->chm_fs_bshift = 12; + chmp->chm_fs_fmask = -2048; + chmp->chm_fs_qfmask = 2047; + + chmp->chm_wbuf_pagesize = chmp->chm_ebh->flash_if->page_size; + dbg("wbuf size: %zu\n", chmp->chm_wbuf_pagesize); + chmp->chm_wbuf = kmem_alloc(chmp->chm_wbuf_pagesize, KM_SLEEP); + 
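+	/* Write-back buffer: one flash page in size, guarded by chm_lock_wbuf. */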
rw_init(&chmp->chm_lock_wbuf); + + //init queues + TAILQ_INIT(&chmp->chm_free_queue); + TAILQ_INIT(&chmp->chm_clean_queue); + TAILQ_INIT(&chmp->chm_dirty_queue); + TAILQ_INIT(&chmp->chm_very_dirty_queue); + TAILQ_INIT(&chmp->chm_erasable_pending_wbuf_queue); + TAILQ_INIT(&chmp->chm_erase_pending_queue); + + chfs_calc_trigger_levels(chmp); + + chmp->chm_nr_free_blocks = 0; + chmp->chm_nr_erasable_blocks = 0; + chmp->chm_max_vno = 2; + chmp->chm_checked_vno = 2; + chmp->chm_unchecked_size = 0; + chmp->chm_used_size = 0; + chmp->chm_dirty_size = 0; + chmp->chm_wasted_size = 0; + chmp->chm_free_size = chmp->chm_ebh->eb_size * chmp->chm_ebh->peb_nr; + err = chfs_build_filesystem(chmp); + + if (err) { + chfs_vnocache_hash_destroy(chmp->chm_vnocache_hash); + kmem_free(chmp->chm_ebh, chmp->chm_ebh->peb_nr * + sizeof(struct chfs_eraseblock)); + ebh_close(chmp->chm_ebh); + free(chmp, M_UFSMNT); + return EIO; + } + + mp->mnt_data = ump; + mp->mnt_stat.f_fsidx.__fsid_val[0] = (long)dev; + mp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_CHFS); + mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; + mp->mnt_stat.f_namemax = MAXNAMLEN; + mp->mnt_flag |= MNT_LOCAL; + mp->mnt_fs_bshift = PAGE_SHIFT; + mp->mnt_dev_bshift = DEV_BSHIFT; + mp->mnt_iflag |= IMNT_MPSAFE; + ump->um_flags = 0; + ump->um_mountp = mp; + ump->um_dev = dev; + ump->um_devvp = devvp; + ump->um_maxfilesize = 1048512 * 1024; + /*TODO fill these fields + ump->um_nindir = + ump->um_lognindir = + ump->um_bptrtodb = + ump->um_seqinc = + ump->um_maxsymlinklen = + ump->um_dirblksiz = + ump->um_maxfilesize = + */ + + /* + * Allocate the root vnode. + */ + err = VFS_VGET(mp, CHFS_ROOTINO, &vp); + if (err) { + dbg("error: %d while allocating root node\n", err); + return err; + } + vput(vp); + + chfs_gc_thread_start(chmp); + mutex_enter(&chmp->chm_lock_mountfields); + chfs_gc_trigger(chmp); + mutex_exit(&chmp->chm_lock_mountfields); + + devvp->v_specmountpoint = mp; + return 0; +} + +/* --------------------------------------------------------------------- */ + +/* ARGSUSED2 */ +static int +chfs_unmount(struct mount *mp, int mntflags) +{ + int flags = 0, i = 0; + struct ufsmount *ump; + struct chfs_mount *chmp; +// struct chfs_vnode_cache *vc, *next; + + if (mntflags & MNT_FORCE) + flags |= FORCECLOSE; + + dbg("[START]\n"); + + ump = VFSTOUFS(mp); + chmp = ump->um_chfs; + + chfs_gc_thread_stop(chmp); + + (void)vflush(mp, NULLVP, flags); + + if (chmp->chm_wbuf_len) { + mutex_enter(&chmp->chm_lock_mountfields); + chfs_flush_pending_wbuf(chmp); + mutex_exit(&chmp->chm_lock_mountfields); + } + + for (i = 0; i < chmp->chm_ebh->peb_nr; i++) { + chfs_free_node_refs(&chmp->chm_blocks[i]); + } + + chfs_vnocache_hash_destroy(chmp->chm_vnocache_hash); + + ebh_close(chmp->chm_ebh); + + rw_destroy(&chmp->chm_lock_wbuf); + mutex_destroy(&chmp->chm_lock_vnocache); + mutex_destroy(&chmp->chm_lock_sizes); + mutex_destroy(&chmp->chm_lock_mountfields); + + if (ump->um_devvp->v_type != VBAD) { + ump->um_devvp->v_specmountpoint = NULL; + } + vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); + (void)VOP_CLOSE(ump->um_devvp, FREAD|FWRITE, NOCRED); + vput(ump->um_devvp); + + mutex_destroy(&ump->um_lock); + + //free(ump->um_chfs, M_UFSMNT); + free(ump, M_UFSMNT); + mp->mnt_data = NULL; + mp->mnt_flag &= ~MNT_LOCAL; + dbg("[END]\n"); + return (0); +} + +/* --------------------------------------------------------------------- */ + +static int +chfs_root(struct mount *mp, struct vnode **vpp) +{ + struct vnode *vp; + int error; + + if ((error = VFS_VGET(mp, 
(ino_t)ROOTINO, &vp)) != 0) + return error; + *vpp = vp; + return 0; +} + +/* --------------------------------------------------------------------- */ + +extern rb_tree_ops_t frag_rbtree_ops; + +static int +chfs_vget(struct mount *mp, ino_t ino, struct vnode **vpp) +{ + struct chfs_mount *chmp; + struct chfs_inode *ip; + struct ufsmount *ump; + struct vnode *vp; + dev_t dev; + int error; + struct chfs_vnode_cache* chvc = NULL; + struct chfs_node_ref* nref = NULL; + struct buf *bp; + + dbg("vget() | ino: %llu\n", (unsigned long long)ino); + + ump = VFSTOUFS(mp); + dev = ump->um_dev; +retry: + if (!vpp) { + vpp = kmem_alloc(sizeof(struct vnode*), KM_SLEEP); + } + + if ((*vpp = chfs_ihashget(dev, ino, LK_EXCLUSIVE)) != NULL) { + return 0; + } + + /* Allocate a new vnode/inode. */ + if ((error = getnewvnode(VT_CHFS, + mp, chfs_vnodeop_p, NULL, &vp)) != 0) { + *vpp = NULL; + return (error); + } + ip = pool_get(&chfs_inode_pool, PR_WAITOK); + + mutex_enter(&chfs_hashlock); + if ((*vpp = chfs_ihashget(dev, ino, LK_EXCLUSIVE)) != NULL) { + mutex_exit(&chfs_hashlock); + ungetnewvnode(vp); + pool_put(&chfs_inode_pool, ip); + goto retry; + } + + vp->v_vflag |= VV_LOCKSWORK; + + memset(ip, 0, sizeof(*ip)); + vp->v_data = ip; + ip->vp = vp; + ip->ump = ump; + ip->chmp = chmp = ump->um_chfs; + ip->dev = dev; + ip->ino = ino; + vp->v_mount = mp; + genfs_node_init(vp, &chfs_genfsops); + + rb_tree_init(&ip->fragtree, &frag_rbtree_ops); + //mutex_init(&ip->inode_lock, MUTEX_DEFAULT, IPL_NONE); + + chfs_ihashins(ip); + mutex_exit(&chfs_hashlock); + + // set root inode + if (ino == CHFS_ROOTINO) { + dbg("SETROOT\n"); + vp->v_vflag |= VV_ROOT; + vp->v_type = VDIR; + ip->mode = IFMT | IEXEC | IWRITE | IREAD; + ip->iflag |= (IN_ACCESS | IN_CHANGE | IN_UPDATE); + chfs_update(vp, NULL, NULL, UPDATE_WAIT); +// ip->dents = NULL; XXXTAILQ + TAILQ_INIT(&ip->dents); + chfs_set_vnode_size(vp, 512); + } + + // set vnode cache + mutex_enter(&chmp->chm_lock_vnocache); + chvc = chfs_vnode_cache_get(chmp, ino); + mutex_exit(&chmp->chm_lock_vnocache); + if (!chvc) { + dbg("!chvc\n"); + /* XXX, we cant alloc under a lock, refactor this! 
*/ + chvc = chfs_vnode_cache_alloc(ino); + mutex_enter(&chmp->chm_lock_vnocache); + if (ino == CHFS_ROOTINO) { + chvc->nlink = 2; + chvc->pvno = CHFS_ROOTINO; + chfs_vnode_cache_set_state(chmp, + chvc, VNO_STATE_CHECKEDABSENT); + } + chfs_vnode_cache_add(chmp, chvc); + mutex_exit(&chmp->chm_lock_vnocache); + + ip->chvc = chvc; + TAILQ_INIT(&ip->dents); + } else { + dbg("chvc\n"); + ip->chvc = chvc; + // if we have a vnode cache, the node is already on flash, so read it + if (ino == CHFS_ROOTINO) { + chvc->pvno = CHFS_ROOTINO; + TAILQ_INIT(&chvc->scan_dirents); + } else { + chfs_readvnode(mp, ino, &vp); + } + + mutex_enter(&chmp->chm_lock_mountfields); + // init type specific things + switch (vp->v_type) { + case VDIR: + nref = chvc->dirents; + while (nref && + (struct chfs_vnode_cache *)nref != chvc) { + chfs_readdirent(mp, nref, ip); + nref = nref->nref_next; + } + chfs_set_vnode_size(vp, 512); + break; + case VREG: + case VSOCK: + //build the fragtree of the vnode + dbg("read_inode_internal | ino: %llu\n", + (unsigned long long)ip->ino); + error = chfs_read_inode(chmp, ip); + if (error) { + vput(vp); + *vpp = NULL; + mutex_exit(&chmp->chm_lock_mountfields); + return (error); + } + break; + case VLNK: + //build the fragtree of the vnode + dbg("read_inode_internal | ino: %llu\n", + (unsigned long long)ip->ino); + error = chfs_read_inode_internal(chmp, ip); + if (error) { + vput(vp); + *vpp = NULL; + mutex_exit(&chmp->chm_lock_mountfields); + return (error); + } + + dbg("size: %llu\n", (unsigned long long)ip->size); + bp = getiobuf(vp, true); + bp->b_blkno = 0; + bp->b_bufsize = bp->b_resid = + bp->b_bcount = ip->size; + bp->b_data = kmem_alloc(ip->size, KM_SLEEP); + chfs_read_data(chmp, vp, bp); + if (!ip->target) + ip->target = kmem_alloc(ip->size, + KM_SLEEP); + memcpy(ip->target, bp->b_data, ip->size); + kmem_free(bp->b_data, ip->size); + putiobuf(bp); + + break; + case VCHR: + case VBLK: + case VFIFO: + //build the fragtree of the vnode + dbg("read_inode_internal | ino: %llu\n", + (unsigned long long)ip->ino); + error = chfs_read_inode_internal(chmp, ip); + if (error) { + vput(vp); + *vpp = NULL; + mutex_exit(&chmp->chm_lock_mountfields); + return (error); + } + + bp = getiobuf(vp, true); + bp->b_blkno = 0; + bp->b_bufsize = bp->b_resid = + bp->b_bcount = sizeof(dev_t); + bp->b_data = kmem_alloc(sizeof(dev_t), KM_SLEEP); + chfs_read_data(chmp, vp, bp); + memcpy(&ip->rdev, + bp->b_data, sizeof(dev_t)); + kmem_free(bp->b_data, sizeof(dev_t)); + putiobuf(bp); + if (vp->v_type == VFIFO) + vp->v_op = chfs_fifoop_p; + else { + vp->v_op = chfs_specop_p; + spec_node_init(vp, ip->rdev); + } + + break; + case VNON: + case VBAD: + break; + } + mutex_exit(&chmp->chm_lock_mountfields); + + } + + /* finish inode initalization */ + ip->devvp = ump->um_devvp; + vref(ip->devvp); + + uvm_vnp_setsize(vp, ip->size); + *vpp = vp; + + return 0; +} + +/* --------------------------------------------------------------------- */ + +static int +chfs_fhtovp(struct mount *mp, struct fid *fhp, struct vnode **vpp) +{ + return ENODEV; +} + +/* --------------------------------------------------------------------- */ + +static int +chfs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size) +{ + return ENODEV; +} + +/* --------------------------------------------------------------------- */ + +static int +chfs_start(struct mount *mp, int flags) +{ + return 0; +} + +/* --------------------------------------------------------------------- */ + +/* ARGSUSED2 */ +static int +chfs_statvfs(struct mount *mp, struct 
statvfs *sbp) +{ + struct chfs_mount *chmp; + struct ufsmount *ump; + dbg("statvfs\n"); + + ump = VFSTOUFS(mp); + chmp = ump->um_chfs; + + sbp->f_flag = mp->mnt_flag; + sbp->f_bsize = chmp->chm_ebh->eb_size; + sbp->f_frsize = chmp->chm_ebh->eb_size; + sbp->f_iosize = chmp->chm_ebh->eb_size; + + sbp->f_blocks = chmp->chm_ebh->peb_nr; + sbp->f_files = 0; + sbp->f_bavail = chmp->chm_nr_free_blocks - chmp->chm_resv_blocks_write; +#if 0 + printf("chmp->chm_nr_free_blocks: %jd\n", + (intmax_t )chmp->chm_nr_free_blocks); + printf("chmp->chm_resv_blocks_write: %jd\n", + (intmax_t) chmp->chm_resv_blocks_write); + printf("chmp->chm_ebh->peb_nr: %jd\n", + (intmax_t) chmp->chm_ebh->peb_nr); +#endif + + sbp->f_bfree = chmp->chm_nr_free_blocks; + sbp->f_bresvd = chmp->chm_resv_blocks_write; + + /* FFS specific */ + sbp->f_ffree = 0; + sbp->f_favail = 0; + sbp->f_fresvd = 0; + + copy_statvfs_info(sbp, mp); + + return 0; +} + +/* --------------------------------------------------------------------- */ + +/* ARGSUSED0 */ +static int +chfs_sync(struct mount *mp, int waitfor, + kauth_cred_t uc) +{ + return 0; +} + +/* --------------------------------------------------------------------- */ + +static void +chfs_init(void) +{ + chfs_alloc_pool_caches(); + chfs_ihashinit(); + pool_init(&chfs_inode_pool, sizeof(struct chfs_inode), 0, 0, 0, + "chfsinopl", &pool_allocator_nointr, IPL_NONE); + ufs_init(); +} + +/* --------------------------------------------------------------------- */ + +static void +chfs_reinit(void) +{ + chfs_ihashreinit(); + ufs_reinit(); +} + +/* --------------------------------------------------------------------- */ + +static void +chfs_done(void) +{ + ufs_done(); + chfs_ihashdone(); + pool_destroy(&chfs_inode_pool); + chfs_destroy_pool_caches(); +} + +/* --------------------------------------------------------------------- */ + +static int +chfs_snapshot(struct mount *mp, struct vnode *vp, + struct timespec *ctime) +{ + return ENODEV; +} + +/* --------------------------------------------------------------------- */ + +/* + * chfs vfs operations. 
+ */ + +extern const struct vnodeopv_desc chfs_fifoop_opv_desc; +extern const struct vnodeopv_desc chfs_specop_opv_desc; +extern const struct vnodeopv_desc chfs_vnodeop_opv_desc; + +const struct vnodeopv_desc * const chfs_vnodeopv_descs[] = { + &chfs_fifoop_opv_desc, + &chfs_specop_opv_desc, + &chfs_vnodeop_opv_desc, + NULL, +}; + +struct vfsops chfs_vfsops = { + MOUNT_CHFS, /* vfs_name */ + sizeof (struct chfs_args), + chfs_mount, /* vfs_mount */ + chfs_start, /* vfs_start */ + chfs_unmount, /* vfs_unmount */ + chfs_root, /* vfs_root */ + ufs_quotactl, /* vfs_quotactl */ + chfs_statvfs, /* vfs_statvfs */ + chfs_sync, /* vfs_sync */ + chfs_vget, /* vfs_vget */ + chfs_fhtovp, /* vfs_fhtovp */ + chfs_vptofh, /* vfs_vptofh */ + chfs_init, /* vfs_init */ + chfs_reinit, /* vfs_reinit */ + chfs_done, /* vfs_done */ + NULL, /* vfs_mountroot */ + chfs_snapshot, /* vfs_snapshot */ + vfs_stdextattrctl, /* vfs_extattrctl */ + (void *)eopnotsupp, /* vfs_suspendctl */ + genfs_renamelock_enter, + genfs_renamelock_exit, + (void *)eopnotsupp, + chfs_vnodeopv_descs, + 0, /* vfs_refcount */ + { NULL, NULL }, +}; + +static int +chfs_modcmd(modcmd_t cmd, void *arg) +{ + switch (cmd) { + case MODULE_CMD_INIT: + return vfs_attach(&chfs_vfsops); + case MODULE_CMD_FINI: + return vfs_detach(&chfs_vfsops); + default: + return ENOTTY; + } +} diff --git a/sys/ufs/chfs/chfs_vnode.c b/sys/ufs/chfs/chfs_vnode.c new file mode 100644 index 000000000..2e1b386bd --- /dev/null +++ b/sys/ufs/chfs/chfs_vnode.c @@ -0,0 +1,393 @@ +/* $NetBSD: chfs_vnode.c,v 1.2 2011/11/24 21:09:37 agc Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * Copyright (C) 2010 Tamas Toth + * Copyright (C) 2010 Adam Hoka + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "chfs.h" +#include "chfs_inode.h" +#include +#include +#include +#include +#include + +struct vnode * +chfs_vnode_lookup(struct chfs_mount *chmp, ino_t vno) +{ + struct vnode *vp; + struct chfs_inode *ip; + + TAILQ_FOREACH(vp, &chmp->chm_fsmp->mnt_vnodelist, v_mntvnodes) { + ip = VTOI(vp); + if (ip && ip->ino == vno) + return vp; + } + return NULL; +} + +int +chfs_readvnode(struct mount* mp, ino_t ino, struct vnode** vpp) +{ + struct ufsmount* ump = VFSTOUFS(mp); + struct chfs_mount *chmp = ump->um_chfs; + struct chfs_vnode_cache *chvc; + struct chfs_flash_vnode *chfvn; + struct chfs_inode *ip; + int err; + char* buf; + size_t retlen, len; + struct vnode* vp = NULL; + dbg("readvnode | ino: %llu\n", (unsigned long long)ino); + + len = sizeof(struct chfs_flash_vnode); + + KASSERT(vpp != NULL); + + if (vpp != NULL) { + vp = *vpp; + } + + ip = VTOI(vp); + chvc = ip->chvc; + + if (chvc && ino != CHFS_ROOTINO) { + /* debug... */ + printf("readvnode; offset: %" PRIu32 ", lnr: %d\n", + CHFS_GET_OFS(chvc->v->nref_offset), chvc->v->nref_lnr); + + KASSERT((void *)chvc != (void *)chvc->v); + + buf = kmem_alloc(len, KM_SLEEP); + err = chfs_read_leb(chmp, chvc->v->nref_lnr, buf, + CHFS_GET_OFS(chvc->v->nref_offset), len, &retlen); + if (err) + return err; + if (retlen != len) { + chfs_err("Error reading vnode: read: %zu insted of: %zu\n", + len, retlen); + return EIO; + } + chfvn = (struct chfs_flash_vnode*)buf; + chfs_set_vnode_size(vp, chfvn->dn_size); + ip->mode = chfvn->mode; + vp->v_type = IFTOVT(ip->mode); + ip->version = chfvn->version; + //ip->chvc->highest_version = ip->version; + ip->uid = chfvn->uid; + ip->gid = chfvn->gid; + ip->atime = chfvn->atime; + ip->mtime = chfvn->mtime; + ip->ctime = chfvn->ctime; + kmem_free(buf, len); + } + + + *vpp = vp; + return 0; +} + +int +chfs_readdirent(struct mount *mp, struct chfs_node_ref *chnr, struct chfs_inode *pdir) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct chfs_mount *chmp = ump->um_chfs; + struct chfs_flash_dirent_node chfdn; + struct chfs_dirent *fd;//, *pdents; + size_t len = sizeof(struct chfs_flash_dirent_node); +// struct chfs_vnode_cache* parent; + size_t retlen; + int err = 0; + +// parent = chfs_get_vnode_cache(chmp, pdir->ino); + + //read flash_dirent_node + err = chfs_read_leb(chmp, chnr->nref_lnr, (char *)&chfdn, + CHFS_GET_OFS(chnr->nref_offset), len, &retlen); + if (err) { + return err; + } + if (retlen != len) { + chfs_err("Error reading vnode: read: %zu insted of: %zu\n", + retlen, len); + return EIO; + } + + //set fields of dirent + fd = chfs_alloc_dirent(chfdn.nsize + 1); + fd->version = chfdn.version; + fd->vno = chfdn.vno; + fd->type = chfdn.dtype; + fd->nsize = chfdn.nsize; +// fd->next = NULL; + + err = chfs_read_leb(chmp, chnr->nref_lnr, fd->name, + CHFS_GET_OFS(chnr->nref_offset) + len, chfdn.nsize, &retlen); + if (err) { + return err; + } + + if (retlen != chfdn.nsize) { + chfs_err("Error reading vnode: read: %zu insted of: %zu\n", + len, retlen); + return EIO; + } + + fd->name[fd->nsize] = 0; + fd->nref = chnr; + + chfs_add_fd_to_inode(chmp, pdir, fd); +/* + pdents = pdir->i_chfs_ext.dents; + if (!pdents) + pdir->i_chfs_ext.dents = fd; + else { + while (pdents->next != NULL) { + pdents = pdents->next; + } + pdents->next = fd; + } +*/ + return 0; +} + +/* + * Allocate a new inode. 
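+ * A fresh vnode number is taken from chm_max_vno and a vnode cache entry
+ * is set up for it; the new inode is written to flash, then the parent
+ * directory is updated and a dirent for the new name is written out.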
+ */ +int +chfs_makeinode(int mode, struct vnode *dvp, struct vnode **vpp, + struct componentname *cnp, int type) +{ + struct chfs_inode *ip, *pdir; + struct vnode *vp; + struct ufsmount* ump = VFSTOUFS(dvp->v_mount); + struct chfs_mount* chmp = ump->um_chfs; + struct chfs_vnode_cache* chvc; + int error, ismember = 0; + ino_t vno; + struct chfs_dirent *nfd;//, *fd; + + dbg("makeinode\n"); + pdir = VTOI(dvp); + + *vpp = NULL; + + vno = ++(chmp->chm_max_vno); + + error = VFS_VGET(dvp->v_mount, vno, &vp); + if (error) + return (error); + + mutex_enter(&chmp->chm_lock_vnocache); + chvc = chfs_vnode_cache_get(chmp, vno); + mutex_exit(&chmp->chm_lock_vnocache); + + chvc->pvno = pdir->ino; + chvc->vno_version = kmem_alloc(sizeof(uint64_t), KM_SLEEP); + *(chvc->vno_version) = 1; + if (type != VDIR) + chvc->nlink = 1; + else + chvc->nlink = 2; +// chfs_vnode_cache_set_state(chmp, chvc, VNO_STATE_CHECKEDABSENT); + chvc->state = VNO_STATE_CHECKEDABSENT; + + ip = VTOI(vp); + ip->ino = vno; + + if (type == VDIR) + chfs_set_vnode_size(vp, 512); + else + chfs_set_vnode_size(vp, 0); + + ip->uid = kauth_cred_geteuid(cnp->cn_cred); + ip->gid = kauth_cred_getegid(cnp->cn_cred); + ip->version = 1; + ip->iflag |= (IN_ACCESS | IN_CHANGE | IN_UPDATE); + + ip->chvc = chvc; + //ip->chvc->highest_version = 1; + ip->target = NULL; + + ip->mode = mode; + vp->v_type = type; /* Rest init'd in getnewvnode(). */ + if ((ip->mode & ISGID) && (kauth_cred_ismember_gid(cnp->cn_cred, + ip->gid, &ismember) != 0 || !ismember) && + kauth_authorize_generic(cnp->cn_cred, KAUTH_GENERIC_ISSUSER, NULL)) + ip->mode &= ~ISGID; + + chfs_update(vp, NULL, NULL, UPDATE_WAIT); + + mutex_enter(&chmp->chm_lock_mountfields); + + //write inode to flash + error = chfs_write_flash_vnode(chmp, ip, ALLOC_NORMAL); + if (error) { + mutex_exit(&chmp->chm_lock_mountfields); + vput(vp); + vput(dvp); + return error; + } + //update parent directory and write it to the flash + pdir->iflag |= (IN_ACCESS | IN_CHANGE | IN_MODIFY | IN_UPDATE); + chfs_update(dvp, NULL, NULL, UPDATE_WAIT); + + error = chfs_write_flash_vnode(chmp, pdir, ALLOC_NORMAL); + if (error) { + mutex_exit(&chmp->chm_lock_mountfields); + vput(vp); + vput(dvp); + return error; + } + vput(dvp); + + //set up node's full dirent + nfd = chfs_alloc_dirent(cnp->cn_namelen + 1); + nfd->vno = ip->ino; + nfd->version = (++pdir->chvc->highest_version); + nfd->type = type; +// nfd->next = NULL; + nfd->nsize = cnp->cn_namelen; + memcpy(&(nfd->name), cnp->cn_nameptr, cnp->cn_namelen); + nfd->name[nfd->nsize] = 0; + nfd->nhash = hash32_buf(nfd->name, cnp->cn_namelen, HASH32_BUF_INIT); + + // write out direntry + error = chfs_write_flash_dirent(chmp, pdir, ip, nfd, ip->ino, ALLOC_NORMAL); + if (error) { + mutex_exit(&chmp->chm_lock_mountfields); + vput(vp); + return error; + } + + //TODO set parent's dir times + + chfs_add_fd_to_inode(chmp, pdir, nfd); +/* + fd = pdir->i_chfs_ext.dents; + if (!fd) + pdir->i_chfs_ext.dents = nfd; + else { + while (fd->next != NULL) { + fd = fd->next; + } + fd->next = nfd; + } +*/ + //pdir->i_nlink++; + pdir->chvc->nlink++; + + mutex_exit(&chmp->chm_lock_mountfields); + + *vpp = vp; + return (0); +} + +void +chfs_set_vnode_size(struct vnode *vp, size_t size) +{ + struct chfs_inode *ip; + + KASSERT(vp != NULL); + + ip = VTOI(vp); + KASSERT(ip != NULL); + + ip->size = size; + vp->v_size = vp->v_writesize = size; + return; +} + +void +chfs_change_size_free(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb, int change) +{ + KASSERT(mutex_owned(&chmp->chm_lock_sizes)); + 
KASSERT((int)(chmp->chm_free_size + change) >= 0); + KASSERT((int)(cheb->free_size + change) >= 0); + KASSERT((int)(cheb->free_size + change) <= chmp->chm_ebh->eb_size); + chmp->chm_free_size += change; + cheb->free_size += change; + return; +} + +void +chfs_change_size_dirty(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb, int change) +{ + KASSERT(mutex_owned(&chmp->chm_lock_sizes)); + KASSERT((int)(chmp->chm_dirty_size + change) >= 0); + KASSERT((int)(cheb->dirty_size + change) >= 0); + KASSERT((int)(cheb->dirty_size + change) <= chmp->chm_ebh->eb_size); + chmp->chm_dirty_size += change; + cheb->dirty_size += change; + return; +} + +void +chfs_change_size_unchecked(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb, int change) +{ + KASSERT(mutex_owned(&chmp->chm_lock_sizes)); + KASSERT((int)(chmp->chm_unchecked_size + change) >= 0); + KASSERT((int)(cheb->unchecked_size + change) >= 0); + KASSERT((int)(cheb->unchecked_size + change) <= chmp->chm_ebh->eb_size); + chmp->chm_unchecked_size += change; + cheb->unchecked_size += change; + return; +} + +void +chfs_change_size_used(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb, int change) +{ + KASSERT(mutex_owned(&chmp->chm_lock_sizes)); + KASSERT((int)(chmp->chm_used_size + change) >= 0); + KASSERT((int)(cheb->used_size + change) >= 0); + KASSERT((int)(cheb->used_size + change) <= chmp->chm_ebh->eb_size); + chmp->chm_used_size += change; + cheb->used_size += change; + return; +} + +void +chfs_change_size_wasted(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb, int change) +{ + KASSERT(mutex_owned(&chmp->chm_lock_sizes)); + KASSERT((int)(chmp->chm_wasted_size + change) >= 0); + KASSERT((int)(cheb->wasted_size + change) >= 0); + KASSERT((int)(cheb->wasted_size + change) <= chmp->chm_ebh->eb_size); + chmp->chm_wasted_size += change; + cheb->wasted_size += change; + return; +} + diff --git a/sys/ufs/chfs/chfs_vnode_cache.c b/sys/ufs/chfs/chfs_vnode_cache.c new file mode 100644 index 000000000..101b49402 --- /dev/null +++ b/sys/ufs/chfs/chfs_vnode_cache.c @@ -0,0 +1,165 @@ +/* $NetBSD: chfs_vnode_cache.c,v 1.1 2011/11/24 15:51:32 ahoka Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * Copyright (C) 2010 Tamas Toth + * Copyright (C) 2010 Adam Hoka + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "chfs.h" +#include + +struct chfs_vnode_cache ** +chfs_vnocache_hash_init(void) +{ + return kmem_zalloc(VNODECACHE_SIZE * + sizeof(struct chfs_vnode_cache *), KM_SLEEP); +} + +/** + * chfs_set_vnode_cache_state - set state of a vnode_cache + * @chmp: fs super block info + * @vc: vnode_cache + * @state: new state + */ +void +chfs_vnode_cache_set_state(struct chfs_mount *chmp, + struct chfs_vnode_cache* vc, int state) +{ + /* XXX do we really need locking here? */ + KASSERT(mutex_owned(&chmp->chm_lock_vnocache)); + vc->state = state; +} + +/** + * chfs_get_vnode_cache - get a vnode_cache from the vnocache_hash + * @chmp: fs super block info + * @ino: inode for search + * Returns the vnode_cache. + */ +struct chfs_vnode_cache * +chfs_vnode_cache_get(struct chfs_mount *chmp, ino_t vno) +{ + struct chfs_vnode_cache* ret; + + KASSERT(mutex_owned(&chmp->chm_lock_vnocache)); + + ret = chmp->chm_vnocache_hash[vno % VNODECACHE_SIZE]; + + if (ret == NULL) { + return NULL; + } + + while (ret && ret->vno < vno) { + ret = ret->next; + } + + if (ret && ret->vno != vno) { + ret = NULL; + } + + return ret; +} + +/** + * chfs_add_vnode_cache - add a vnode_cache to the vnocache_hash + * @chmp: fs super block info + * @new: new vnode_cache + */ +void +chfs_vnode_cache_add(struct chfs_mount *chmp, + struct chfs_vnode_cache* new) +{ + struct chfs_vnode_cache** prev; + + KASSERT(mutex_owned(&chmp->chm_lock_vnocache)); + + if (!new->vno) { + new->vno = ++chmp->chm_max_vno; + } + + prev = &chmp->chm_vnocache_hash[new->vno % VNODECACHE_SIZE]; + + while ((*prev) && (*prev)->vno < new->vno) { + prev = &((*prev)->next); + } + new->next = *prev; + *prev = new; +} + +/** + * chfs_del_vnode_cache - del a vnode_cache from the vnocache_hash + * @chmp: fs super block info + * @old: old vnode_cache + */ +void +chfs_vnode_cache_remove(struct chfs_mount *chmp, + struct chfs_vnode_cache* old) +{ + struct chfs_vnode_cache** prev; + + KASSERT(mutex_owned(&chmp->chm_lock_vnocache)); + + prev = &chmp->chm_vnocache_hash[old->vno % VNODECACHE_SIZE]; + while ((*prev) && (*prev)->vno < old->vno) { + prev = &(*prev)->next; + } + + if ((*prev) == old) { + *prev = old->next; + } + + if (old->state != VNO_STATE_READING && + old->state != VNO_STATE_CLEARING) { + chfs_vnode_cache_free(old); + } +} + +/** + * chfs_free_vnode_caches - free the vnocache_hash + * @chmp: fs super block info + */ +void +chfs_vnocache_hash_destroy(struct chfs_vnode_cache **hash) +{ + struct chfs_vnode_cache *this, *next; + int i; + + for (i = 0; i < VNODECACHE_SIZE; i++) { + this = hash[i]; + while (this) { + next = this->next; + chfs_vnode_cache_free(this); + this = next; + } + hash[i] = NULL; + } +} + + diff --git a/sys/ufs/chfs/chfs_vnops.c b/sys/ufs/chfs/chfs_vnops.c new file mode 100644 index 000000000..f6a11d93b --- /dev/null +++ b/sys/ufs/chfs/chfs_vnops.c @@ -0,0 +1,1765 @@ +/* $NetBSD: chfs_vnops.c,v 1.2 2011/11/24 21:09:37 agc Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, 
Hungary + * Copyright (C) 2010 Tamas Toth + * Copyright (C) 2010 Adam Hoka + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "chfs.h" + +#define READ_S "chfs_read" + +int +chfs_lookup(void *v) +{ + struct vnode *dvp = ((struct vop_lookup_args *) v)->a_dvp; + struct vnode **vpp = ((struct vop_lookup_args *) v)->a_vpp; + struct componentname *cnp = ((struct vop_lookup_args *) v)->a_cnp; + + int error; + struct chfs_inode* ip; + struct ufsmount* ump; + struct chfs_mount* chmp; + struct chfs_vnode_cache* chvc; + struct chfs_dirent* fd; + + dbg("lookup(): %s\n", cnp->cn_nameptr); + + KASSERT(VOP_ISLOCKED(dvp)); + + *vpp = NULL; + + // Check accessibility of requested node as a first step. + error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred); + if (error != 0) { + goto out; + } + + // If requesting the last path component on a read-only file system + // with a write operation, deny it. + if ((cnp->cn_flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) + && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { + error = EROFS; + goto out; + } + + // Avoid doing a linear scan of the directory if the requested + // directory/name couple is already in the cache. + error = cache_lookup(dvp, vpp, cnp); + if (error >= 0) { + goto out; + } + + ip = VTOI(dvp); + ump = VFSTOUFS(dvp->v_mount); + chmp = ump->um_chfs; + if (ip->ino == 0) { + ip->ino = ++chmp->chm_max_vno; + } + mutex_enter(&chmp->chm_lock_vnocache); + chvc = chfs_vnode_cache_get(chmp, ip->ino); + mutex_exit(&chmp->chm_lock_vnocache); + + // We cannot be requesting the parent directory of the root node. 
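+	// (For the root directory pvno == vno, so ISDOTDOT must not be set here.)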
+ KASSERT(IMPLIES(dvp->v_type == VDIR && chvc->pvno == chvc->vno, + !(cnp->cn_flags & ISDOTDOT))); + + if (cnp->cn_flags & ISDOTDOT) { + VOP_UNLOCK(dvp); + error = VFS_VGET(dvp->v_mount, ip->chvc->pvno, vpp); + vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); + } else if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { + vref(dvp); + *vpp = dvp; + error = 0; + } else { + fd = chfs_dir_lookup(ip, cnp); + + if (fd == NULL) { + dbg("fd null\n"); + // The entry was not found in the directory. + // This is OK if we are creating or renaming an + // entry and are working on the last component of + // the path name. + if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_nameiop == CREATE + || cnp->cn_nameiop == RENAME)) { + error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred); + if (error) { + dbg("after the entry was not found in dir\n"); + goto out; + } + + dbg("return EJUSTRETURN\n"); + error = EJUSTRETURN; + } else { + error = ENOENT; + } + } else { + // If we are not at the last path component and + // found a non-directory or non-link entry (which + // may itself be pointing to a directory), raise + // an error. + if ((fd->type != VDIR && fd->type != VLNK) && !(cnp->cn_flags + & ISLASTCN)) { + error = ENOTDIR; + goto out; + } + + dbg("vno@allocating new vnode: %llu\n", + (unsigned long long)fd->vno); + error = VFS_VGET(dvp->v_mount, fd->vno, vpp); + } + } + // Store the result of this lookup in the cache. Avoid this if the + // request was for creation, as it does not improve timings on + // emprical tests. + if ((cnp->cn_flags & MAKEENTRY) && cnp->cn_nameiop != CREATE + && (cnp->cn_flags & ISDOTDOT) == 0) + cache_enter(dvp, *vpp, cnp); + +out: + // If there were no errors, *vpp cannot be null and it must be + // locked. + KASSERT(IFF(error == 0, *vpp != NULL && VOP_ISLOCKED(*vpp))); + + // dvp must always be locked. 
+ KASSERT(VOP_ISLOCKED(dvp)); + + return error; +} + +/* --------------------------------------------------------------------- */ + +int +chfs_create(void *v) +{ + struct vop_create_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */*ap = v; + int error, mode; + dbg("create()\n"); + + mode = MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode); + + if ((mode & IFMT) == 0) { + if (ap->a_vap->va_type == VREG) + mode |= IFREG; + if (ap->a_vap->va_type == VSOCK) + mode |= IFSOCK; + } + + error = chfs_makeinode(mode, ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap->va_type); + + if (error) { + dbg("error: %d\n", error); + return error; + } + + VN_KNOTE(ap->a_dvp, NOTE_WRITE); + return 0; +} +/* --------------------------------------------------------------------- */ + +int +chfs_mknod(void *v) +{ + struct vnode *dvp = ((struct vop_mknod_args *) v)->a_dvp; + struct vnode **vpp = ((struct vop_mknod_args *) v)->a_vpp; + struct componentname *cnp = ((struct vop_mknod_args *) v)->a_cnp; + struct vattr *vap = ((struct vop_mknod_args *) v)->a_vap; + int mode, err = 0; + struct chfs_inode *ip; + struct vnode *vp; + + struct ufsmount *ump; + struct chfs_mount *chmp; + ino_t ino; + + struct chfs_full_dnode *fd; + struct buf *bp; + int len; + dbg("mknod()\n"); + + ump = VFSTOUFS(dvp->v_mount); + chmp = ump->um_chfs; + + if (vap->va_type != VBLK && vap->va_type != VCHR && vap->va_type != VFIFO) + return EINVAL; + + vp = *vpp; + + mode = MAKEIMODE(vap->va_type, vap->va_mode); + + if ((mode & IFMT) == 0) { + switch (vap->va_type) { + case VBLK: + mode |= IFBLK; + break; + case VCHR: + mode |= IFCHR; + break; + case VFIFO: + mode |= IFIFO; + break; + default: + break; + } + } + + err = chfs_makeinode(mode, dvp, &vp, cnp, vap->va_type); + + ip = VTOI(vp); + ino = ip->ino; + if (vap->va_rdev != VNOVAL) + ip->rdev = vap->va_rdev; + + if (vap->va_type == VFIFO) + vp->v_op = chfs_fifoop_p; + else { + vp->v_op = chfs_specop_p; + spec_node_init(vp, ip->rdev); + } + + if (err) + return err; + + len = sizeof(dev_t); + chfs_set_vnode_size(vp, len); + bp = getiobuf(vp, true); + bp->b_bufsize = bp->b_resid = len; + bp->b_data = kmem_alloc(len, KM_SLEEP); + memcpy(bp->b_data, &ip->rdev, len); + bp->b_blkno = 0; + + fd = chfs_alloc_full_dnode(); + + mutex_enter(&chmp->chm_lock_mountfields); + + err = chfs_write_flash_dnode(chmp, vp, bp, fd); + if (err) { + mutex_exit(&chmp->chm_lock_mountfields); + kmem_free(bp->b_data, len); + return err; + } + + err = chfs_add_full_dnode_to_inode(chmp, ip, fd); + if (err) { + mutex_exit(&chmp->chm_lock_mountfields); + kmem_free(bp->b_data, len); + return err; + } + + mutex_exit(&chmp->chm_lock_mountfields); + + *vpp = vp; + kmem_free(bp->b_data, len); + putiobuf(bp); + + return 0; +} + +/* --------------------------------------------------------------------- */ + +int +chfs_open(void *v) +{ + struct vnode *vp = ((struct vop_open_args *) v)->a_vp; + int mode = ((struct vop_open_args *) v)->a_mode; + dbg("open()\n"); + + int error; + struct chfs_inode *ip; + + KASSERT(VOP_ISLOCKED(vp)); + + ip = VTOI(vp); + + KASSERT(vp->v_size == ip->size); + if (ip->chvc->nlink < 1) { + error = ENOENT; + goto out; + } + + // If the file is marked append-only, deny write requests. 
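+	// A plain FWRITE open fails with EPERM; opening with O_APPEND as well is
+	// still allowed, since appending cannot overwrite existing data.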
+ if (ip->flags & APPEND && (mode & (FWRITE | O_APPEND)) == FWRITE) + error = EPERM; + else + error = 0; + +out: + KASSERT(VOP_ISLOCKED(vp)); + return error; +} + +/* --------------------------------------------------------------------- */ + +int +chfs_close(void *v) +{ + struct vnode *vp = ((struct vop_close_args *) v)->a_vp; + dbg("close()\n"); + + struct chfs_inode *ip; + + KASSERT(VOP_ISLOCKED(vp)); + + ip = VTOI(vp); + + if (ip->chvc->nlink > 0) { + //ip->chvc->nlink = 0; + chfs_update(vp, NULL, NULL, UPDATE_CLOSE); + } + + return 0; +} + +/* --------------------------------------------------------------------- */ + +int +chfs_access(void *v) +{ + struct vnode *vp = ((struct vop_access_args *) v)->a_vp; + int mode = ((struct vop_access_args *) v)->a_mode; + kauth_cred_t cred = ((struct vop_access_args *) v)->a_cred; + + dbg("access()\n"); + struct chfs_inode *ip = VTOI(vp); + + if (mode & VWRITE) { + switch (vp->v_type) { + case VLNK: + case VDIR: + case VREG: + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + break; + case VBLK: + case VCHR: + case VSOCK: + case VFIFO: + break; + default: + break; + } + } + + if (mode & VWRITE && ip->flags & IMMUTABLE) + return (EPERM); + + return genfs_can_access(vp->v_type, ip->mode & ALLPERMS, + ip->uid, ip->gid, mode, cred); +} + +/* --------------------------------------------------------------------- */ + +int +chfs_getattr(void *v) +{ + struct vnode *vp = ((struct vop_getattr_args *) v)->a_vp; + struct vattr *vap = ((struct vop_getattr_args *) v)->a_vap; + + struct chfs_inode *ip = VTOI(vp); + dbg("getattr()\n"); + + KASSERT(vp->v_size == ip->size); + + vattr_null(vap); + CHFS_ITIMES(ip, NULL, NULL, NULL); + + vap->va_type = vp->v_type; + vap->va_mode = ip->mode & ALLPERMS; + vap->va_nlink = ip->chvc->nlink; + vap->va_uid = ip->uid; + vap->va_gid = ip->gid; + vap->va_fsid = ip->dev; + vap->va_fileid = ip->ino; + vap->va_size = ip->size; + vap->va_blocksize = PAGE_SIZE; + vap->va_atime.tv_sec = ip->atime; + vap->va_atime.tv_nsec = 0; + vap->va_mtime.tv_sec = ip->mtime; + vap->va_mtime.tv_nsec = 0; + vap->va_ctime.tv_sec = ip->ctime; + vap->va_ctime.tv_nsec = 0; + vap->va_gen = ip->version; + vap->va_flags = ip->flags; + vap->va_rdev = ip->rdev; + vap->va_bytes = round_page(ip->size); + vap->va_filerev = VNOVAL; + vap->va_vaflags = 0; + vap->va_spare = VNOVAL; + + return 0; +} + +/* --------------------------------------------------------------------- */ + +/* Note: modelled after tmpfs's same function */ + +int +chfs_setattr(void *v) +{ + struct vnode *vp = ((struct vop_setattr_args *) v)->a_vp; + struct vattr *vap = ((struct vop_setattr_args *) v)->a_vap; + kauth_cred_t cred = ((struct vop_setattr_args *) v)->a_cred; + + struct chfs_inode *ip; + struct ufsmount *ump = VFSTOUFS(vp->v_mount); + struct chfs_mount *chmp = ump->um_chfs; + int error = 0; + + dbg("setattr()\n"); + + KASSERT(VOP_ISLOCKED(vp)); + ip = VTOI(vp); + + /* Abort if any unsettable attribute is given. 
*/ + if (vap->va_type != VNON || vap->va_nlink != VNOVAL || + vap->va_fsid != VNOVAL || vap->va_fileid != VNOVAL || + vap->va_blocksize != VNOVAL /*|| GOODTIME(&vap->va_ctime)*/ || + vap->va_gen != VNOVAL || vap->va_rdev != VNOVAL || + vap->va_bytes != VNOVAL) { + return EINVAL; + } + + if (error == 0 && (vap->va_flags != VNOVAL)) + error = chfs_chflags(vp, vap->va_flags, cred); + + if (error == 0 && (vap->va_size != VNOVAL)) + error = chfs_chsize(vp, vap->va_size, cred); + + if (error == 0 && (vap->va_uid != VNOVAL || vap->va_gid != VNOVAL)) + error = chfs_chown(vp, vap->va_uid, vap->va_gid, cred); + + if (error == 0 && (vap->va_mode != VNOVAL)) + error = chfs_chmod(vp, vap->va_mode, cred); + +#if 0 + /* why do we need that? */ + if (ip->flags & (IMMUTABLE | APPEND)) + return EPERM; +#endif + + if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { + error = genfs_can_chtimes(vp, vap->va_vaflags, ip->uid, cred); + if (error) + return error; + if (vap->va_atime.tv_sec != VNOVAL) + ip->iflag |= IN_ACCESS; + if (vap->va_mtime.tv_sec != VNOVAL) + ip->iflag |= IN_CHANGE | IN_UPDATE; + error = chfs_update(vp, + &vap->va_atime, &vap->va_mtime, UPDATE_WAIT); + if (error) + return error; + } + + mutex_enter(&chmp->chm_lock_mountfields); + error = chfs_write_flash_vnode(chmp, ip, ALLOC_NORMAL); + mutex_exit(&chmp->chm_lock_mountfields); + + return error; +} + +int +chfs_chmod(struct vnode *vp, int mode, kauth_cred_t cred) +{ + struct chfs_inode *ip = VTOI(vp); + int error; + dbg("chmod\n"); + + error = genfs_can_chmod(vp, cred, ip->uid, ip->gid, mode); + if (error) + return error; + ip->mode &= ~ALLPERMS; + ip->mode |= (mode & ALLPERMS); + ip->iflag |= IN_CHANGE; + + error = chfs_update(vp, NULL, NULL, UPDATE_WAIT); + if (error) + return error; + + return 0; +} + +int +chfs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred) +{ + struct chfs_inode *ip = VTOI(vp); + int error; + dbg("chown\n"); + + if (uid == (uid_t)VNOVAL) + uid = ip->uid; + if (gid == (gid_t)VNOVAL) + gid = ip->gid; + + error = genfs_can_chown(vp, cred, ip->uid, ip->gid, uid, gid); + if (error) + return error; + + ip->gid = gid; + ip->uid = uid; + ip->iflag |= IN_CHANGE; + + error = chfs_update(vp, NULL, NULL, UPDATE_WAIT); + if (error) + return error; + + return 0; +} + + +/* --------------------------------------------------------------------- */ +/* calculates ((off_t)blk * chmp->chm_chm_fs_bsize) */ +#define lblktosize(chmp, blk) \ + (((off_t)(blk)) << (chmp)->chm_fs_bshift) + +/* calculates (loc % chmp->chm_chm_fs_bsize) */ +#define blkoff(chmp, loc) \ + ((loc) & (chmp)->chm_fs_qbmask) + +/* calculates (loc / chmp->chm_chm_fs_bsize) */ +#define lblkno(chmp, loc) \ + ((loc) >> (chmp)->chm_fs_bshift) + +/* calculates roundup(size, chmp->chm_chm_fs_fsize) */ +#define fragroundup(chmp, size) \ + (((size) + (chmp)->chm_fs_qfmask) & (chmp)->chm_fs_fmask) + +#define blksize(chmp, ip, lbn) \ + (((lbn) >= NDADDR || (ip)->size >= lblktosize(chmp, (lbn) + 1)) \ + ? 
(chmp)->chm_fs_bsize \ + : (fragroundup(chmp, blkoff(chmp, (ip)->size)))) + +/* calculates roundup(size, chmp->chm_chm_fs_bsize) */ +#define blkroundup(chmp, size) \ + (((size) + (chmp)->chm_fs_qbmask) & (chmp)->chm_fs_bmask) + +int +chfs_read(void *v) +{ + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp; + struct chfs_inode *ip; + struct uio *uio; + struct ufsmount *ump; + struct buf *bp; + struct chfs_mount *chmp; + daddr_t lbn, nextlbn; + off_t bytesinfile; + long size, xfersize, blkoffset; + int error, ioflag; + vsize_t bytelen; + bool usepc = false; + + dbg("chfs_read\n"); + + vp = ap->a_vp; + ip = VTOI(vp); + ump = ip->ump; + uio = ap->a_uio; + ioflag = ap->a_ioflag; + error = 0; + + dbg("ip->size:%llu\n", (unsigned long long)ip->size); + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_READ) + panic("%s: mode", READ_S); + + if (vp->v_type == VLNK) { + if (ip->size < ump->um_maxsymlinklen) + panic("%s: short symlink", READ_S); + } else if (vp->v_type != VREG && vp->v_type != VDIR) + panic("%s: type %d", READ_S, vp->v_type); +#endif + chmp = ip->chmp; + if ((u_int64_t)uio->uio_offset > ump->um_maxfilesize) + return (EFBIG); + if (uio->uio_resid == 0) + return (0); + + fstrans_start(vp->v_mount, FSTRANS_SHARED); + + if (uio->uio_offset >= ip->size) + goto out; + + usepc = vp->v_type == VREG; + bytelen = 0; + if (usepc) { + const int advice = IO_ADV_DECODE(ap->a_ioflag); + + while (uio->uio_resid > 0) { + if (ioflag & IO_DIRECT) { + genfs_directio(vp, uio, ioflag); + } + bytelen = MIN(ip->size - uio->uio_offset, + uio->uio_resid); + if (bytelen == 0) + break; + error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice, + UBC_READ | UBC_PARTIALOK | + (UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0)); + if (error) + break; + + } + goto out; + } + + + dbg("start reading\n"); + for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { + bytesinfile = ip->size - uio->uio_offset; + if (bytesinfile <= 0) + break; + lbn = lblkno(chmp, uio->uio_offset); + nextlbn = lbn + 1; + size = blksize(chmp, ip, lbn); + blkoffset = blkoff(chmp, uio->uio_offset); + xfersize = MIN(MIN(chmp->chm_fs_bsize - blkoffset, uio->uio_resid), + bytesinfile); + + if (lblktosize(chmp, nextlbn) >= ip->size) { + error = bread(vp, lbn, size, NOCRED, 0, &bp); + dbg("after bread\n"); + } else { + int nextsize = blksize(chmp, ip, nextlbn); + dbg("size: %ld\n", size); + error = breadn(vp, lbn, + size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp); + dbg("after breadN\n"); + } + if (error) + break; + + /* + * We should only get non-zero b_resid when an I/O error + * has occurred, which should cause us to break above. + * However, if the short read did not cause an error, + * then we want to ensure that we do not uiomove bad + * or uninitialized data. 
+ */ + size -= bp->b_resid; + if (size < xfersize) { + if (size == 0) + break; + xfersize = size; + } + dbg("uiomove\n"); + error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); + if (error) + break; + brelse(bp, 0); + } + if (bp != NULL) + brelse(bp, 0); + +out: + if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) { + ip->iflag |= IN_ACCESS; + if ((ap->a_ioflag & IO_SYNC) == IO_SYNC) { + //error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) { + fstrans_done(vp->v_mount); + return error; + } + error = chfs_update(vp, NULL, NULL, UPDATE_WAIT); + //UFS_WAPBL_END(vp->v_mount); + } + } + + dbg("[END]\n"); + fstrans_done(vp->v_mount); + return (error); +} + + +/* --------------------------------------------------------------------- */ + +/*from ffs write*/ +int +chfs_write(void *v) +{ + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp ; + struct uio *uio; + struct chfs_inode *ip; + struct chfs_mount *chmp; + struct lwp *l; + kauth_cred_t cred; + off_t osize, origoff, oldoff, preallocoff, endallocoff, nsize; + int blkoffset, error, flags, ioflag, resid; + int aflag; + int extended=0; + vsize_t bytelen; + bool async; + struct ufsmount *ump; + + + cred = ap->a_cred; + ioflag = ap->a_ioflag; + uio = ap->a_uio; + vp = ap->a_vp; + ip = VTOI(vp); + //dbg("file size (vp): %llu\n", (unsigned long long)vp->v_size); + //dbg("file size (ip): %llu\n", (unsigned long long)ip->i_size); + ump = ip->ump; + + //dbg("uio->resid: %d\n", uio->uio_resid); + dbg("write\n"); + + KASSERT(vp->v_size == ip->size); + + switch (vp->v_type) { + case VREG: + if (ioflag & IO_APPEND) + uio->uio_offset = ip->size; + if ((ip->flags & APPEND) && uio->uio_offset != ip->size) + return (EPERM); + /* FALLTHROUGH */ + case VLNK: + break; + case VDIR: + if ((ioflag & IO_SYNC) == 0) + panic("chfs_write: nonsync dir write"); + break; + default: + panic("chfs_write: type"); + } + + chmp = ip->chmp; + if (uio->uio_offset < 0 || + (u_int64_t)uio->uio_offset + + uio->uio_resid > ump->um_maxfilesize) { + dbg("uio->uio_offset = %lld | uio->uio_offset + " + "uio->uio_resid (%llu) > ump->um_maxfilesize (%lld)\n", + (long long)uio->uio_offset, + (uint64_t)uio->uio_offset + uio->uio_resid, + (long long)ump->um_maxfilesize); + return (EFBIG); + } + /* + * Maybe this should be above the vnode op call, but so long as + * file servers have no limits, I don't think it matters. + */ + l = curlwp; + if (vp->v_type == VREG && l && + uio->uio_offset + uio->uio_resid > + l->l_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) { + mutex_enter(proc_lock); + psignal(l->l_proc, SIGXFSZ); + mutex_exit(proc_lock); + return (EFBIG); + } + if (uio->uio_resid == 0) + return (0); + + //mutex_enter(&ip->inode_lock); + fstrans_start(vp->v_mount, FSTRANS_SHARED); + + flags = ioflag & IO_SYNC ? B_SYNC : 0; + async = vp->v_mount->mnt_flag & MNT_ASYNC; + origoff = uio->uio_offset; + resid = uio->uio_resid; + osize = ip->size; + error = 0; + + + /*if ((ioflag & IO_JOURNALLOCKED) == 0) { + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) { + fstrans_done(vp->v_mount); + return error; + } + }*/ + + preallocoff = round_page(blkroundup(chmp, + MAX(osize, uio->uio_offset))); + aflag = ioflag & IO_SYNC ? B_SYNC : 0; + nsize = MAX(osize, uio->uio_offset + uio->uio_resid); + endallocoff = nsize - blkoff(chmp, nsize); + + /* + * if we're increasing the file size, deal with expanding + * the fragment if there is one. 
+ */ + + if (nsize > osize && lblkno(chmp, osize) < NDADDR && + lblkno(chmp, osize) != lblkno(chmp, nsize) && + blkroundup(chmp, osize) != osize) { + off_t eob; + + eob = blkroundup(chmp, osize); + uvm_vnp_setwritesize(vp, eob); + error = ufs_balloc_range(vp, osize, eob - osize, cred, aflag); + if (error) + goto out; + if (flags & B_SYNC) { + mutex_enter(vp->v_interlock); + VOP_PUTPAGES(vp, + trunc_page(osize & chmp->chm_fs_bmask), + round_page(eob), + PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED); + } + } + + while (uio->uio_resid > 0) { + int ubc_flags = UBC_WRITE; + bool overwrite; /* if we're overwrite a whole block */ + off_t newoff; + + if (ioflag & IO_DIRECT) { + genfs_directio(vp, uio, ioflag | IO_JOURNALLOCKED); + } + + oldoff = uio->uio_offset; + blkoffset = blkoff(chmp, uio->uio_offset); + bytelen = MIN(chmp->chm_fs_bsize - blkoffset, uio->uio_resid); + if (bytelen == 0) { + break; + } + + /* + * if we're filling in a hole, allocate the blocks now and + * initialize the pages first. if we're extending the file, + * we can safely allocate blocks without initializing pages + * since the new blocks will be inaccessible until the write + * is complete. + */ + overwrite = uio->uio_offset >= preallocoff && + uio->uio_offset < endallocoff; + if (!overwrite && (vp->v_vflag & VV_MAPPED) == 0 && + blkoff(chmp, uio->uio_offset) == 0 && + (uio->uio_offset & PAGE_MASK) == 0) { + vsize_t len; + + len = trunc_page(bytelen); + len -= blkoff(chmp, len); + if (len > 0) { + overwrite = true; + bytelen = len; + } + } + + newoff = oldoff + bytelen; + if (vp->v_size < newoff) { + uvm_vnp_setwritesize(vp, newoff); + } + + if (!overwrite) { + error = ufs_balloc_range(vp, uio->uio_offset, bytelen, + cred, aflag); + if (error) + break; + } else { + genfs_node_wrlock(vp); + error = GOP_ALLOC(vp, uio->uio_offset, bytelen, + aflag, cred); + genfs_node_unlock(vp); + if (error) + break; + ubc_flags |= UBC_FAULTBUSY; + } + + /* + * copy the data. + */ + + ubc_flags |= UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0; + error = ubc_uiomove(&vp->v_uobj, uio, bytelen, + IO_ADV_DECODE(ioflag), ubc_flags); + + /* + * update UVM's notion of the size now that we've + * copied the data into the vnode's pages. + * + * we should update the size even when uiomove failed. + */ + + if (vp->v_size < newoff) { + uvm_vnp_setsize(vp, newoff); + extended = 1; + } + + if (error) + break; + + /* + * flush what we just wrote if necessary. + * XXXUBC simplistic async flushing. + */ + + if (!async && oldoff >> 16 != uio->uio_offset >> 16) { + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, (oldoff >> 16) << 16, + (uio->uio_offset >> 16) << 16, + PGO_CLEANIT | PGO_JOURNALLOCKED); + if (error) + break; + } + } +out: + if (error == 0 && ioflag & IO_SYNC) { + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, + trunc_page(origoff & chmp->chm_fs_bmask), + round_page(blkroundup(chmp, uio->uio_offset)), + PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED); + } + ip->iflag |= IN_CHANGE | IN_UPDATE; + if (resid > uio->uio_resid && ap->a_cred && + kauth_authorize_generic(ap->a_cred, KAUTH_GENERIC_ISSUSER, NULL)) { + ip->mode &= ~(ISUID | ISGID); + } + if (resid > uio->uio_resid) + VN_KNOTE(vp, NOTE_WRITE | (extended ? 
NOTE_EXTEND : 0)); + if (error) { + (void) UFS_TRUNCATE(vp, osize, ioflag & IO_SYNC, ap->a_cred); + uio->uio_offset -= resid - uio->uio_resid; + uio->uio_resid = resid; + } else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC) + error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT); + + //XXX hack, i write the next line after i know ip->i_size and vp->v_size don't equal + chfs_set_vnode_size(vp, vp->v_size); + + + //dbg("end file size (vp): %llu\n", (unsigned long long)vp->v_size); + //dbg("end file size (ip): %llu\n", (unsigned long long)ip->i_size); + KASSERT(vp->v_size == ip->size); + fstrans_done(vp->v_mount); + + mutex_enter(&chmp->chm_lock_mountfields); + error = chfs_write_flash_vnode(chmp, ip, ALLOC_NORMAL); + mutex_exit(&chmp->chm_lock_mountfields); + + //mutex_exit(&ip->inode_lock); + //dbg("end\n"); + return (error); +} + + +/* --------------------------------------------------------------------- */ + +int +chfs_fsync(void *v) +{ + //dbg("fsync\n"); + struct vop_fsync_args /* { + struct vnode *a_vp; + kauth_cred_t a_cred; + int a_flags; + off_t offlo; + off_t offhi; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + int wait; + + if (ap->a_flags & FSYNC_CACHE) { + return ENODEV; + } + wait = (ap->a_flags & FSYNC_WAIT) != 0; + vflushbuf(vp, wait); + //struct chfs_inode *ip = VTOI(vp); + //chfs_set_vnode_size(vp, ip->write_size); + + return 0; +} + +/* --------------------------------------------------------------------- */ + +int +chfs_remove(void *v) +{ + struct vnode *dvp = ((struct vop_remove_args *) v)->a_dvp; + struct vnode *vp = ((struct vop_remove_args *) v)->a_vp; + struct componentname *cnp = (((struct vop_remove_args *) v)->a_cnp); + dbg("remove\n"); + + KASSERT(VOP_ISLOCKED(dvp)); + KASSERT(VOP_ISLOCKED(vp)); + + struct chfs_inode *ip = VTOI(vp); + struct chfs_inode *parent = VTOI(dvp); + int error = 0; + + KASSERT(ip->chvc->vno != ip->chvc->pvno); + + error = chfs_do_unlink(ip, + parent, cnp->cn_nameptr, cnp->cn_namelen); + + vput(dvp); + vput(vp); + + return error; +} + +/* --------------------------------------------------------------------- */ + +int +chfs_link(void *v) +{ + struct vnode *dvp = ((struct vop_link_args *) v)->a_dvp; + struct vnode *vp = ((struct vop_link_args *) v)->a_vp; + struct componentname *cnp = ((struct vop_link_args *) v)->a_cnp; + + struct chfs_inode *ip, *parent; + int error = 0; + + if (vp->v_type == VDIR) { + VOP_ABORTOP(dvp, cnp); + error = EISDIR; + goto out; + } + if (dvp->v_mount != vp->v_mount) { + VOP_ABORTOP(dvp, cnp); + error = EXDEV; + goto out; + } + if (dvp != vp && (error = vn_lock(vp, LK_EXCLUSIVE))) { + VOP_ABORTOP(dvp, cnp); + goto out; + } + + parent = VTOI(dvp); + ip = VTOI(vp); + + error = chfs_do_link(ip, + parent, cnp->cn_nameptr, cnp->cn_namelen, vp->v_type); + + if (dvp != vp) + VOP_UNLOCK(vp); +out: + vput(dvp); + return error; +} + +/* --------------------------------------------------------------------- */ + +int +chfs_rename(void *v) +{ + struct vnode *fdvp = ((struct vop_rename_args *) v)->a_fdvp; + struct vnode *fvp = ((struct vop_rename_args *) v)->a_fvp; + struct componentname *fcnp = ((struct vop_rename_args *) v)->a_fcnp; + struct vnode *tdvp = ((struct vop_rename_args *) v)->a_tdvp; + struct vnode *tvp = ((struct vop_rename_args *) v)->a_tvp; + struct componentname *tcnp = ((struct vop_rename_args *) v)->a_tcnp; + + struct chfs_inode *oldparent, *old; + struct chfs_inode *newparent; + struct chfs_dirent *fd;//, *oldfd; + struct chfs_inode *ip; + int error = 0; + dbg("rename\n"); + + 
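+	/*
+	 * Rename is carried out as a link of the source inode under the new
+	 * name followed by an unlink of the old name.  The target vnodes
+	 * arrive locked while the source vnodes are only referenced, hence
+	 * the vput()/vrele() pairs at the end of this function.
+	 */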
KASSERT(VOP_ISLOCKED(tdvp)); + KASSERT(IMPLIES(tvp != NULL, VOP_ISLOCKED(tvp) == LK_EXCLUSIVE)); + + oldparent = VTOI(fdvp); + old = VTOI(fvp); + newparent = VTOI(tdvp); + if (tvp) { + dbg("tvp not null\n"); + ip = VTOI(tvp); + if (tvp->v_type == VDIR) { + //TODO: lock +// fd = ip->dents; +// while (fd) { + TAILQ_FOREACH(fd, &ip->dents, fds) { + if (fd->vno) { + //TODO: unlock + error = ENOTEMPTY; + goto out_unlocked; + } +// fd = fd->next; + } + //TODO: unlock + } + error = chfs_do_unlink(ip, + newparent, tcnp->cn_nameptr, tcnp->cn_namelen); + vput(tvp); + } + VFS_VGET(tdvp->v_mount, old->ino, &tvp); + ip = VTOI(tvp); + +// for (oldfd = oldparent->dents; +// oldfd->vno != old->ino; +// oldfd = oldfd->next); + + error = chfs_do_link(ip, + newparent, tcnp->cn_nameptr, tcnp->cn_namelen, tvp->v_type); + error = chfs_do_unlink(old, + oldparent, fcnp->cn_nameptr, fcnp->cn_namelen); + +//out: +// if (fchnode != tchnode) +// VOP_UNLOCK(fdvp, 0); + +out_unlocked: + // Release target nodes. + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp != NULL) + vput(tvp); + + // Release source nodes. + vrele(fdvp); + vrele(fvp); + + return error; +} + +/* --------------------------------------------------------------------- */ + +int +chfs_mkdir(void *v) +{ + struct vnode *dvp = ((struct vop_mkdir_args *) v)->a_dvp; + struct vnode **vpp = ((struct vop_mkdir_args *)v)->a_vpp; + struct componentname *cnp = ((struct vop_mkdir_args *) v)->a_cnp; + struct vattr *vap = ((struct vop_mkdir_args *) v)->a_vap; + dbg("mkdir()\n"); + + int mode; + + mode = vap->va_mode & ACCESSPERMS; + if ((mode & IFMT) == 0) { + mode |= IFDIR; + } + + KASSERT(vap->va_type == VDIR); + + return chfs_makeinode(mode, dvp, vpp, cnp, VDIR); +} + +/* --------------------------------------------------------------------- */ + +int +chfs_rmdir(void *v) +{ + struct vnode *dvp = ((struct vop_rmdir_args *) v)->a_dvp; + struct vnode *vp = ((struct vop_rmdir_args *) v)->a_vp; + struct componentname *cnp = ((struct vop_rmdir_args *) v)->a_cnp; + dbg("rmdir()\n"); + + KASSERT(VOP_ISLOCKED(dvp)); + KASSERT(VOP_ISLOCKED(vp)); + + struct chfs_inode *ip = VTOI(vp); + struct chfs_inode *parent = VTOI(dvp); + struct chfs_dirent *fd; + int error = 0; + + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + + KASSERT(ip->chvc->vno != ip->chvc->pvno); + +// for (fd = ip->dents; fd; fd = fd->next) { + TAILQ_FOREACH(fd, &ip->dents, fds) { + if (fd->vno) { + error = ENOTEMPTY; + goto out; + } + } + + error = chfs_do_unlink(ip, + parent, cnp->cn_nameptr, cnp->cn_namelen); + +out: + vput(dvp); + vput(vp); + + return error; +} + +/* --------------------------------------------------------------------- */ + +int +chfs_symlink(void *v) +{ + struct vnode *dvp = ((struct vop_symlink_args *) v)->a_dvp; + struct vnode **vpp = ((struct vop_symlink_args *) v)->a_vpp; + struct componentname *cnp = ((struct vop_symlink_args *) v)->a_cnp; + struct vattr *vap = ((struct vop_symlink_args *) v)->a_vap; + char *target = ((struct vop_symlink_args *) v)->a_target; + + struct ufsmount *ump; + struct chfs_mount *chmp; + struct vnode *vp; + struct chfs_inode *ip; + int len, err; + struct chfs_full_dnode *fd; + struct buf *bp; + dbg("symlink()\n"); + + ump = VFSTOUFS(dvp->v_mount); + chmp = ump->um_chfs; + + err = chfs_makeinode(IFLNK | vap->va_mode, dvp, vpp, cnp, VLNK); + if (err) + return (err); + VN_KNOTE(dvp, NOTE_WRITE); + vp = *vpp; + len = strlen(target); + ip = VTOI(vp); + /* TODO max symlink len instead of "100" */ + if (len < 100) { + ip->target = 
kmem_alloc(len, KM_SLEEP); + memcpy(ip->target, target, len); + chfs_set_vnode_size(vp, len); + ip->iflag |= IN_CHANGE | IN_UPDATE; + + bp = getiobuf(vp, true); + bp->b_bufsize = bp->b_resid = len; + bp->b_data = kmem_alloc(len, KM_SLEEP); + memcpy(bp->b_data, target, len); + bp->b_blkno = 0; + + fd = chfs_alloc_full_dnode(); + + mutex_enter(&chmp->chm_lock_mountfields); + + err = chfs_write_flash_dnode(chmp, vp, bp, fd); + if (err) { + mutex_exit(&chmp->chm_lock_mountfields); + goto out; + } + + err = chfs_add_full_dnode_to_inode(chmp, ip, fd); + if (err) { + mutex_exit(&chmp->chm_lock_mountfields); + goto out; + } + + mutex_exit(&chmp->chm_lock_mountfields); + + kmem_free(bp->b_data, len); + putiobuf(bp); + + uvm_vnp_setsize(vp, len); + } else { + err = vn_rdwr(UIO_WRITE, vp, target, len, (off_t)0, + UIO_SYSSPACE, IO_NODELOCKED, cnp->cn_cred, + (size_t *)0, NULL); + } + +out: + if (err) + vput(vp); + + return (err); +} + +/* --------------------------------------------------------------------- */ + +int +chfs_readdir(void *v) +{ + struct vnode *vp = ((struct vop_readdir_args *) v)->a_vp; + struct uio *uio = ((struct vop_readdir_args *) v)->a_uio; + int *eofflag = ((struct vop_readdir_args *) v)->a_eofflag; + + int error = 0; + off_t skip, offset; + struct chfs_inode *ip; + struct chfs_dirent *fd; + + struct ufsmount *ump; + struct chfs_mount *chmp; + struct chfs_vnode_cache *chvc; + + KASSERT(VOP_ISLOCKED(vp)); + + /* This operation only makes sense on directory nodes. */ + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + + ip = VTOI(vp); + + /* uiomove in chfs_filldir automatically increments the + * uio_offset by an arbitrary size, so we discard any change + * to uio_offset and set it to our own value on return + */ + offset = uio->uio_offset; + + if (offset == CHFS_OFFSET_DOT) { + error = chfs_filldir(uio, ip->ino, ".", 1, VDIR); + if (error == -1) { + error = 0; + goto outok; + } else if (error != 0) + goto outok; + + offset = CHFS_OFFSET_DOTDOT; + } + + if (offset == CHFS_OFFSET_DOTDOT) { + ump = VFSTOUFS(vp->v_mount); + chmp = ump->um_chfs; + mutex_enter(&chmp->chm_lock_vnocache); + chvc = chfs_vnode_cache_get(chmp, ip->ino); + mutex_exit(&chmp->chm_lock_vnocache); + + error = chfs_filldir(uio, chvc->pvno, "..", 2, VDIR); + if (error == -1) { + error = 0; + goto outok; + } else if (error != 0) { + goto outok; + } + + if (TAILQ_EMPTY(&ip->dents)) { + offset = CHFS_OFFSET_EOF; + } else { + offset = CHFS_OFFSET_FIRST; + } + } + + if (offset != CHFS_OFFSET_EOF) { + skip = offset - CHFS_OFFSET_FIRST; + + TAILQ_FOREACH(fd, &ip->dents, fds) { + /* seek to offset by skipping items */ + /* XXX race conditions by changed dirent? 
*/ + if (skip > 0) { + skip--; + continue; + } + + if (fd->vno != 0) { + error = chfs_filldir(uio, fd->vno, + fd->name, fd->nsize, fd->type); + if (error == -1) { + error = 0; + goto outok; + } else if (error != 0) { + dbg("err %d\n", error); + goto outok; + } + } + offset++; + } + } + offset = CHFS_OFFSET_EOF; + +outok: + uio->uio_offset = offset; + + if (eofflag != NULL) { + *eofflag = (error == 0 && + uio->uio_offset == CHFS_OFFSET_EOF); + } + +out: + KASSERT(VOP_ISLOCKED(vp)); + + return error; +} + +/* --------------------------------------------------------------------- */ + +int +chfs_readlink(void *v) +{ + + struct vnode *vp = ((struct vop_readlink_args *) v)->a_vp; + struct uio *uio = ((struct vop_readlink_args *) v)->a_uio; + kauth_cred_t cred = ((struct vop_readlink_args *) v)->a_cred; + + struct chfs_inode *ip = VTOI(vp); + + dbg("readlink()\n"); + + /* TODO max symlink len instead of "100" */ + if (ip->size < 100) { + uiomove(ip->target, ip->size, uio); + return (0); + } + + return (VOP_READ(vp, uio, 0, cred)); +} + +/* --------------------------------------------------------------------- */ + +int +chfs_inactive(void *v) +{ + struct vnode *vp = ((struct vop_inactive_args *) v)->a_vp; + struct chfs_inode *ip = VTOI(vp); + struct chfs_vnode_cache *chvc; + dbg("inactive | vno: %llu\n", (unsigned long long)ip->ino); + + KASSERT(VOP_ISLOCKED(vp)); + + if (ip->ino) { + chvc = ip->chvc; + if (chvc->nlink) + *((struct vop_inactive_args *) v)->a_recycle = 0; + } else { + *((struct vop_inactive_args *) v)->a_recycle = 1; + } + + VOP_UNLOCK(vp); + + return 0; +} + +/* --------------------------------------------------------------------- */ + +int +chfs_reclaim(void *v) +{ + struct vop_reclaim_args *ap = v; + struct vnode *vp = ap->a_vp; + struct chfs_inode *ip = VTOI(vp); + struct chfs_mount *chmp = ip->chmp; + struct chfs_dirent *fd; + + //dbg("reclaim() | ino: %llu\n", (unsigned long long)ip->ino); + //mutex_enter(&ip->inode_lock); + + mutex_enter(&chmp->chm_lock_vnocache); + chfs_vnode_cache_set_state(chmp, + ip->chvc, VNO_STATE_CHECKEDABSENT); + mutex_exit(&chmp->chm_lock_vnocache); + + chfs_update(vp, NULL, NULL, UPDATE_CLOSE); + + if (vp->v_type == VREG || vp->v_type == VLNK || vp->v_type == VCHR || + vp->v_type == VBLK || vp->v_type == VFIFO || vp->v_type == VSOCK) + chfs_kill_fragtree(&ip->fragtree); + + fd = TAILQ_FIRST(&ip->dents); + while(fd) { + TAILQ_REMOVE(&ip->dents, fd, fds); + chfs_free_dirent(fd); + fd = TAILQ_FIRST(&ip->dents); + } + //mutex_exit(&ip->inode_lock); + //mutex_destroy(&ip->inode_lock); + + cache_purge(vp); + if (ip->devvp) { + vrele(ip->devvp); + ip->devvp = 0; + } + chfs_ihashrem(ip); + + genfs_node_destroy(vp); + pool_put(&chfs_inode_pool, vp->v_data); + vp->v_data = NULL; + return (0); +} + +/* --------------------------------------------------------------------- */ + +int +chfs_advlock(void *v) +{ + //struct vnode *vp = ((struct vop_advlock_args *) v)->a_vp; + dbg("advlock()\n"); + /* + struct chfs_node *node; + + node = VP_TO_CHFS_NODE(vp); + + return lf_advlock(v, &node->chn_lockf, node->chn_size); + */ + return 0; +} + +/* --------------------------------------------------------------------- */ +int +chfs_strategy(void *v) +{ + struct vop_strategy_args /* { + const struct vnodeop_desc *a_desc; + struct vnode *a_vp; + struct buf *a_bp; + } */ *ap = v; + struct chfs_full_dnode *fd; + struct buf *bp = ap->a_bp; + struct vnode *vp = ap->a_vp; + struct chfs_inode *ip = VTOI(vp); + struct chfs_mount *chmp = ip->chmp; + int read = (bp->b_flags & B_READ) 
? 1 : 0; + int err = 0; + +/* dbg("bp dump:\n"); + dbg(" ->b_bcount: %d\n", bp->b_bcount); + dbg(" ->b_resid: %d\n", bp->b_resid); + dbg(" ->b_blkno: %llu\n", (unsigned long long)bp->b_blkno); + dbg(" ->b_error: %d\n", bp->b_error);*/ + if (read) { + err = chfs_read_data(chmp, vp, bp); + } else { + fd = chfs_alloc_full_dnode(); + + mutex_enter(&chmp->chm_lock_mountfields); + + err = chfs_write_flash_dnode(chmp, vp, bp, fd); + if (err) { + mutex_exit(&chmp->chm_lock_mountfields); + goto out; + } + + err = chfs_add_full_dnode_to_inode(chmp, ip, fd); + /*if (err) { + mutex_exit(&chmp->chm_lock_mountfields); + goto out; + }*/ + + mutex_exit(&chmp->chm_lock_mountfields); + } +out: + biodone(bp); + //dbg("end\n"); + return err; +} + +int +chfs_bmap(void *v) +{ + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct vnode **a_vpp; + daddr_t *a_bnp; + int *a_runp; + int *a_runb; + } */ *ap = v; + if (ap->a_vpp != NULL) + *ap->a_vpp = ap->a_vp; + if (ap->a_bnp != NULL) + *ap->a_bnp = ap->a_bn; + if (ap->a_runp != NULL) + *ap->a_runp = 0; + return (0); +} + +/* + * vnode operations vector used for files stored in a chfs file system. + */ +int +(**chfs_vnodeop_p)(void *); +const struct vnodeopv_entry_desc chfs_vnodeop_entries[] = + { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, chfs_lookup }, + { &vop_create_desc, chfs_create }, + { &vop_mknod_desc, chfs_mknod }, + { &vop_open_desc, chfs_open }, + { &vop_close_desc, chfs_close }, + { &vop_access_desc, chfs_access }, + { &vop_getattr_desc, chfs_getattr }, + { &vop_setattr_desc, chfs_setattr }, + { &vop_read_desc, chfs_read }, + { &vop_write_desc, chfs_write }, + { &vop_ioctl_desc, genfs_enoioctl }, + { &vop_fcntl_desc, genfs_fcntl }, + { &vop_poll_desc, genfs_poll }, + { &vop_kqfilter_desc, genfs_kqfilter }, + { &vop_revoke_desc, genfs_revoke }, + { &vop_mmap_desc, genfs_mmap }, + { &vop_fsync_desc, chfs_fsync }, + { &vop_seek_desc, genfs_seek }, + { &vop_remove_desc, chfs_remove }, + { &vop_link_desc, chfs_link }, + { &vop_rename_desc, chfs_rename }, + { &vop_mkdir_desc, chfs_mkdir }, + { &vop_rmdir_desc, chfs_rmdir }, + { &vop_symlink_desc, chfs_symlink }, + { &vop_readdir_desc, chfs_readdir }, + { &vop_readlink_desc, chfs_readlink }, + { &vop_abortop_desc, genfs_abortop }, + { &vop_inactive_desc, chfs_inactive }, + { &vop_reclaim_desc, chfs_reclaim }, + { &vop_lock_desc, genfs_lock }, + { &vop_unlock_desc, genfs_unlock }, + { &vop_bmap_desc, chfs_bmap }, + { &vop_strategy_desc, chfs_strategy }, + { &vop_print_desc, ufs_print }, + { &vop_pathconf_desc, ufs_pathconf }, + { &vop_islocked_desc, genfs_islocked }, + { &vop_advlock_desc, chfs_advlock }, + { &vop_bwrite_desc, vn_bwrite }, + { &vop_getpages_desc, genfs_getpages }, + { &vop_putpages_desc, genfs_putpages }, + { NULL, NULL } }; + +const struct vnodeopv_desc chfs_vnodeop_opv_desc = + { &chfs_vnodeop_p, chfs_vnodeop_entries }; + +/* --------------------------------------------------------------------- */ + +/* + * vnode operations vector used for special devices stored in a chfs + * file system. 
+ */ +int +(**chfs_specop_p)(void *); +const struct vnodeopv_entry_desc chfs_specop_entries[] = + { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, spec_lookup }, + { &vop_create_desc, spec_create }, + { &vop_mknod_desc, spec_mknod }, + { &vop_open_desc, spec_open }, + { &vop_close_desc, ufsspec_close }, + { &vop_access_desc, chfs_access }, + { &vop_getattr_desc, chfs_getattr }, + { &vop_setattr_desc, chfs_setattr }, + { &vop_read_desc, chfs_read }, + { &vop_write_desc, chfs_write }, + { &vop_ioctl_desc, spec_ioctl }, + { &vop_fcntl_desc, genfs_fcntl }, + { &vop_poll_desc, spec_poll }, + { &vop_kqfilter_desc, spec_kqfilter }, + { &vop_revoke_desc, spec_revoke }, + { &vop_mmap_desc, spec_mmap }, + { &vop_fsync_desc, spec_fsync }, + { &vop_seek_desc, spec_seek }, + { &vop_remove_desc, spec_remove }, + { &vop_link_desc, spec_link }, + { &vop_rename_desc, spec_rename }, + { &vop_mkdir_desc, spec_mkdir }, + { &vop_rmdir_desc, spec_rmdir }, + { &vop_symlink_desc, spec_symlink }, + { &vop_readdir_desc, spec_readdir }, + { &vop_readlink_desc, spec_readlink }, + { &vop_abortop_desc, spec_abortop }, + { &vop_inactive_desc, chfs_inactive }, + { &vop_reclaim_desc, chfs_reclaim }, + { &vop_lock_desc, genfs_lock }, + { &vop_unlock_desc, genfs_unlock }, + { &vop_bmap_desc, spec_bmap }, + { &vop_strategy_desc, spec_strategy }, + { &vop_print_desc, ufs_print }, + { &vop_pathconf_desc, spec_pathconf }, + { &vop_islocked_desc, genfs_islocked }, + { &vop_advlock_desc, spec_advlock }, + { &vop_bwrite_desc, vn_bwrite }, + { &vop_getpages_desc, spec_getpages }, + { &vop_putpages_desc, spec_putpages }, + { NULL, NULL } }; + +const struct vnodeopv_desc chfs_specop_opv_desc = + { &chfs_specop_p, chfs_specop_entries }; + +/* --------------------------------------------------------------------- */ +/* + * vnode operations vector used for fifos stored in a chfs file system. 
+ */ +int +(**chfs_fifoop_p)(void *); +const struct vnodeopv_entry_desc chfs_fifoop_entries[] = + { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, vn_fifo_bypass }, + { &vop_create_desc, vn_fifo_bypass }, + { &vop_mknod_desc, vn_fifo_bypass }, + { &vop_open_desc, vn_fifo_bypass }, + { &vop_close_desc, ufsfifo_close }, + { &vop_access_desc, chfs_access }, + { &vop_getattr_desc, chfs_getattr }, + { &vop_setattr_desc, chfs_setattr }, + { &vop_read_desc, ufsfifo_read }, + { &vop_write_desc, ufsfifo_write }, + { &vop_ioctl_desc, vn_fifo_bypass }, + { &vop_fcntl_desc, genfs_fcntl }, + { &vop_poll_desc, vn_fifo_bypass }, + { &vop_kqfilter_desc, vn_fifo_bypass }, + { &vop_revoke_desc, vn_fifo_bypass }, + { &vop_mmap_desc, vn_fifo_bypass }, + { &vop_fsync_desc, vn_fifo_bypass }, + { &vop_seek_desc, vn_fifo_bypass }, + { &vop_remove_desc, vn_fifo_bypass }, + { &vop_link_desc, vn_fifo_bypass }, + { &vop_rename_desc, vn_fifo_bypass }, + { &vop_mkdir_desc, vn_fifo_bypass }, + { &vop_rmdir_desc, vn_fifo_bypass }, + { &vop_symlink_desc, vn_fifo_bypass }, + { &vop_readdir_desc, vn_fifo_bypass }, + { &vop_readlink_desc, vn_fifo_bypass }, + { &vop_abortop_desc, vn_fifo_bypass }, + { &vop_inactive_desc, chfs_inactive }, + { &vop_reclaim_desc, chfs_reclaim }, + { &vop_lock_desc, genfs_lock }, + { &vop_unlock_desc, genfs_unlock }, + { &vop_bmap_desc, vn_fifo_bypass }, + { &vop_strategy_desc, vn_fifo_bypass }, + { &vop_print_desc, ufs_print }, + { &vop_pathconf_desc, vn_fifo_bypass }, + { &vop_islocked_desc, genfs_islocked }, + { &vop_advlock_desc, vn_fifo_bypass }, + { &vop_bwrite_desc, genfs_nullop }, + { &vop_getpages_desc, genfs_badop }, + { &vop_putpages_desc, vn_fifo_bypass }, + { NULL, NULL } }; + +const struct vnodeopv_desc chfs_fifoop_opv_desc = + { &chfs_fifoop_p, chfs_fifoop_entries }; diff --git a/sys/ufs/chfs/chfs_wbuf.c b/sys/ufs/chfs/chfs_wbuf.c new file mode 100644 index 000000000..c9823a696 --- /dev/null +++ b/sys/ufs/chfs/chfs_wbuf.c @@ -0,0 +1,259 @@ +/* $NetBSD: chfs_wbuf.c,v 1.2 2011/11/24 20:50:33 agc Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * Copyright (C) 2010 Tamas Toth + * Copyright (C) 2010 Adam Hoka + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include "chfs.h" +//#include + +#define DBG_WBUF 1 + +#define PAD(x) (((x)+3)&~3) + +#define EB_ADDRESS(x) ( ((unsigned long)(x) / chmp->chm_ebh->eb_size) * chmp->chm_ebh->eb_size ) + +#define PAGE_DIV(x) ( ((unsigned long)(x) / (unsigned long)(chmp->chm_wbuf_pagesize)) * (unsigned long)(chmp->chm_wbuf_pagesize) ) +#define PAGE_MOD(x) ( (unsigned long)(x) % (unsigned long)(chmp->chm_wbuf_pagesize) ) + +/* +// test functions +int wbuf_test(void); +void wbuf_test_erase_flash(struct chfs_mount*); +void wbuf_test_callback(struct erase_instruction*); +*/ + +#define NOPAD 0 +#define SETPAD 1 + + +/** + * chfs_flush_wbuf - write wbuf to the flash + * @chmp: super block info + * @pad: padding (NOPAD / SETPAD) + * Returns zero in case of success. + */ +static int +chfs_flush_wbuf(struct chfs_mount *chmp, int pad) +{ + int ret=0; + size_t retlen = 0; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + KASSERT(mutex_owned(&chmp->chm_lock_sizes)); + KASSERT(rw_write_held(&chmp->chm_lock_wbuf)); + + if (pad) { + chmp->chm_wbuf_len = PAD(chmp->chm_wbuf_len); + memset(chmp->chm_wbuf + chmp->chm_wbuf_len, 0, chmp->chm_wbuf_pagesize - chmp->chm_wbuf_len); + + struct chfs_flash_padding_node* padnode = (void*)(chmp->chm_wbuf + chmp->chm_wbuf_len); + padnode->magic = htole16(CHFS_FS_MAGIC_BITMASK); + padnode->type = htole16(CHFS_NODETYPE_PADDING); + padnode->length = htole32(chmp->chm_wbuf_pagesize - chmp->chm_wbuf_len); + padnode->hdr_crc = htole32(crc32(0, (uint8_t *)padnode, sizeof(*padnode)-4)); + + struct chfs_node_ref *nref; + nref = chfs_alloc_node_ref(chmp->chm_nextblock); + nref->nref_offset = chmp->chm_wbuf_ofs + chmp->chm_wbuf_len; + nref->nref_offset = CHFS_GET_OFS(nref->nref_offset) | + CHFS_OBSOLETE_NODE_MASK; + chmp->chm_wbuf_len = chmp->chm_wbuf_pagesize; + + chfs_change_size_free(chmp, chmp->chm_nextblock, -padnode->length); + chfs_change_size_wasted(chmp, chmp->chm_nextblock, padnode->length); + } + + ret = chfs_write_leb(chmp, chmp->chm_nextblock->lnr, chmp->chm_wbuf, chmp->chm_wbuf_ofs, chmp->chm_wbuf_len, &retlen); + if(ret) { + return ret; + } + + memset(chmp->chm_wbuf,0xff,chmp->chm_wbuf_pagesize); + chmp->chm_wbuf_ofs += chmp->chm_wbuf_pagesize; + chmp->chm_wbuf_len = 0; + return 0; +} + + +/** + * chfs_fill_wbuf - write to wbuf + * @chmp: super block info + * @buf: buffer + * @len: buffer length + * Return the len of the buf what we didn't write to the wbuf. 
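+ * (As implemented, the value returned is the number of bytes actually
+ * copied into the wbuf; 0 means the wbuf is empty and the data is large
+ * enough for the caller to write it out to flash directly.)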
+ */ +static size_t +chfs_fill_wbuf(struct chfs_mount *chmp, const u_char *buf, size_t len) +{ + if (len && !chmp->chm_wbuf_len && (len >= chmp->chm_wbuf_pagesize)) { + return 0; + } + if (len > (chmp->chm_wbuf_pagesize - chmp->chm_wbuf_len)) { + len = chmp->chm_wbuf_pagesize - chmp->chm_wbuf_len; + } + memcpy(chmp->chm_wbuf + chmp->chm_wbuf_len, buf, len); + + chmp->chm_wbuf_len += (int) len; + return len; +} + +/** + * chfs_write_wbuf - write to wbuf and then the flash + * @chmp: super block info + * @invecs: io vectors + * @count: num of vectors + * @to: offset of target + * @retlen: writed bytes + * Returns zero in case of success. + */ +int +chfs_write_wbuf(struct chfs_mount* chmp, const struct iovec *invecs, long count, + off_t to, size_t *retlen) +{ + int invec, ret = 0; + size_t wbuf_retlen, donelen = 0; + int outvec_to = to; + + int lnr = chmp->chm_nextblock->lnr; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + KASSERT(mutex_owned(&chmp->chm_lock_sizes)); + KASSERT(!rw_write_held(&chmp->chm_lock_wbuf)); + + rw_enter(&chmp->chm_lock_wbuf, RW_WRITER); + + //dbg("1. wbuf ofs: %zu, len: %zu\n", chmp->chm_wbuf_ofs, chmp->chm_wbuf_len); + + if (chmp->chm_wbuf_ofs == 0xffffffff) { + chmp->chm_wbuf_ofs = PAGE_DIV(to); + chmp->chm_wbuf_len = PAGE_MOD(to); + memset(chmp->chm_wbuf, 0xff, chmp->chm_wbuf_pagesize); + } + + //dbg("2. wbuf ofs: %zu, len: %zu\n", chmp->chm_wbuf_ofs, chmp->chm_wbuf_len); + + if (EB_ADDRESS(to) != EB_ADDRESS(chmp->chm_wbuf_ofs)) { + if (chmp->chm_wbuf_len) { + ret = chfs_flush_wbuf(chmp, SETPAD); + if (ret) + goto outerr; + } + chmp->chm_wbuf_ofs = PAGE_DIV(to); + chmp->chm_wbuf_len = PAGE_MOD(to); + } + + //dbg("3. wbuf ofs: %zu, len: %zu\n", chmp->chm_wbuf_ofs, chmp->chm_wbuf_len); + + if (to != PAD(chmp->chm_wbuf_ofs + chmp->chm_wbuf_len)) { + dbg("to: %llu != %zu\n", (unsigned long long)to, + PAD(chmp->chm_wbuf_ofs + chmp->chm_wbuf_len)); + dbg("Non-contiguous write\n"); + panic("BUG\n"); + } + + /* adjust alignment offset */ + if (chmp->chm_wbuf_len != PAGE_MOD(to)) { + chmp->chm_wbuf_len = PAGE_MOD(to); + /* take care of alignement to next page*/ + if (!chmp->chm_wbuf_len) { + chmp->chm_wbuf_len += chmp->chm_wbuf_pagesize; + ret = chfs_flush_wbuf(chmp, NOPAD); + if (ret) + goto outerr; + } + } + + for (invec = 0; invec < count; invec++) { + int vlen = invecs[invec].iov_len; + u_char* v = invecs[invec].iov_base; + + //dbg("invec:%d len:%d\n", invec, vlen); + + wbuf_retlen = chfs_fill_wbuf(chmp, v, vlen); + if (chmp->chm_wbuf_len == chmp->chm_wbuf_pagesize) { + ret = chfs_flush_wbuf(chmp, NOPAD); + if (ret) { + goto outerr; + } + } + vlen -= wbuf_retlen; + outvec_to += wbuf_retlen; + v += wbuf_retlen; + donelen += wbuf_retlen; + if (vlen >= chmp->chm_wbuf_pagesize) { + ret = chfs_write_leb(chmp, lnr, v, outvec_to, PAGE_DIV(vlen), &wbuf_retlen); + //dbg("fd->write: %zu\n", wbuf_retlen); + vlen -= wbuf_retlen; + outvec_to += wbuf_retlen; + chmp->chm_wbuf_ofs = outvec_to; + v += wbuf_retlen; + donelen += wbuf_retlen; + } + wbuf_retlen = chfs_fill_wbuf(chmp, v, vlen); + if (chmp->chm_wbuf_len == chmp->chm_wbuf_pagesize) { + ret = chfs_flush_wbuf(chmp, NOPAD); + if (ret) + goto outerr; + } + + // if we write the last vector, we flush with padding + /*if (invec == count-1) { + ret = chfs_flush_wbuf(chmp, SETPAD); + if (ret) + goto outerr; + }*/ + outvec_to += wbuf_retlen; + donelen += wbuf_retlen; + } + *retlen = donelen; + rw_exit(&chmp->chm_lock_wbuf); + return ret; + +outerr: + *retlen = 0; + return ret; +} + +int chfs_flush_pending_wbuf(struct chfs_mount 
*chmp) +{ + //dbg("flush pending wbuf\n"); + int err; + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + mutex_enter(&chmp->chm_lock_sizes); + rw_enter(&chmp->chm_lock_wbuf, RW_WRITER); + err = chfs_flush_wbuf(chmp, SETPAD); + rw_exit(&chmp->chm_lock_wbuf); + mutex_exit(&chmp->chm_lock_sizes); + return err; +} diff --git a/sys/ufs/chfs/chfs_write.c b/sys/ufs/chfs/chfs_write.c new file mode 100644 index 000000000..0838ed9b1 --- /dev/null +++ b/sys/ufs/chfs/chfs_write.c @@ -0,0 +1,545 @@ +/* $NetBSD: chfs_write.c,v 1.2 2011/11/24 21:09:37 agc Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * Copyright (C) 2010 David Tengeri + * Copyright (C) 2010 Tamas Toth + * Copyright (C) 2010 Adam Hoka + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * chfs_write.c + * + * Created on: 2010.02.17. 
+ * Author: dtengeri + */ + +#include +#include + +#include "chfs.h" + +int +chfs_write_flash_vnode(struct chfs_mount *chmp, + struct chfs_inode *ip, int prio) +{ + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + struct chfs_flash_vnode *fvnode; + struct chfs_vnode_cache* chvc; + struct chfs_node_ref *nref; + struct iovec vec; + size_t size, retlen; + int err = 0, retries = 0; + + if (ip->ino == CHFS_ROOTINO) + return 0; + + fvnode = chfs_alloc_flash_vnode(); + if (!fvnode) + return ENOMEM; + + chvc = ip->chvc; + + /* setting up flash_vnode members */ + size = sizeof(*fvnode); + //dbg("size: %zu | PADDED: %zu\n", size, CHFS_PAD(size)); + fvnode->magic = htole16(CHFS_FS_MAGIC_BITMASK); + fvnode->type = htole16(CHFS_NODETYPE_VNODE); + fvnode->length = htole32(CHFS_PAD(size)); + fvnode->hdr_crc = htole32(crc32(0, (uint8_t *)fvnode, + CHFS_NODE_HDR_SIZE - 4)); + fvnode->vno = htole64(ip->ino); + fvnode->version = htole64(++ip->chvc->highest_version); + fvnode->mode = htole32(ip->mode); + fvnode->dn_size = htole32(ip->size); + fvnode->atime = htole32(ip->atime); + fvnode->ctime = htole32(ip->ctime); + fvnode->mtime = htole32(ip->mtime); + fvnode->gid = htole32(ip->gid); + fvnode->uid = htole32(ip->uid); + fvnode->node_crc = htole32(crc32(0, (uint8_t *)fvnode, size - 4)); + + /* write out flash_vnode */ +retry: + if (prio == ALLOC_GC) { + /* the GC calls this function */ + err = chfs_reserve_space_gc(chmp, CHFS_PAD(size)); + if (err) + goto out; + } else { + chfs_gc_trigger(chmp); + if (prio == ALLOC_NORMAL) + err = chfs_reserve_space_normal(chmp, + CHFS_PAD(size), ALLOC_NORMAL); + else + err = chfs_reserve_space_normal(chmp, + CHFS_PAD(size), ALLOC_DELETION); + if (err) + goto out; + } + + nref = chfs_alloc_node_ref(chmp->chm_nextblock); + if (!nref) { + err = ENOMEM; + goto out; + } + + mutex_enter(&chmp->chm_lock_sizes); + + nref->nref_offset = chmp->chm_ebh->eb_size - chmp->chm_nextblock->free_size; + chfs_change_size_free(chmp, chmp->chm_nextblock, -CHFS_PAD(size)); + vec.iov_base = fvnode; + vec.iov_len = CHFS_PAD(size); + err = chfs_write_wbuf(chmp, &vec, 1, nref->nref_offset, &retlen); + if (err || retlen != CHFS_PAD(size)) { + chfs_err("error while writing out flash vnode to the media\n"); + chfs_err("err: %d | size: %zu | retlen : %zu\n", + err, CHFS_PAD(size), retlen); + chfs_change_size_dirty(chmp, + chmp->chm_nextblock, CHFS_PAD(size)); + if (retries) { + err = EIO; + mutex_exit(&chmp->chm_lock_sizes); + goto out; + } + + retries++; + mutex_exit(&chmp->chm_lock_sizes); + goto retry; + } + //Everything went well + chfs_change_size_used(chmp, + &chmp->chm_blocks[nref->nref_lnr], CHFS_PAD(size)); + mutex_exit(&chmp->chm_lock_sizes); + + chfs_add_vnode_ref_to_vc(chmp, chvc, nref); + KASSERT(chmp->chm_blocks[nref->nref_lnr].used_size <= chmp->chm_ebh->eb_size); +out: + chfs_free_flash_vnode(fvnode); + return err; +} + +int +chfs_write_flash_dirent(struct chfs_mount *chmp, struct chfs_inode *pdir, + struct chfs_inode *ip, struct chfs_dirent *fd, + ino_t ino, int prio) +{ + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + struct chfs_flash_dirent_node *fdirent; + struct chfs_node_ref *nref; + struct iovec vec[2]; + size_t size, retlen; + int err = 0, retries = 0; + uint8_t *name; + size_t namelen; + + KASSERT(fd->vno != CHFS_ROOTINO); + + fdirent = chfs_alloc_flash_dirent(); + if (!fdirent) + return ENOMEM; + + size = sizeof(*fdirent) + fd->nsize; + namelen = CHFS_PAD(size) - sizeof(*fdirent); + + name = kmem_zalloc(namelen, KM_SLEEP); + memcpy(name, fd->name, fd->nsize); + 
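+	/*
+	 * The name was copied into a zero-filled buffer of
+	 * CHFS_PAD(size) - sizeof(*fdirent) bytes, so the two iovecs set up
+	 * below add up to exactly CHFS_PAD(size) and the dirent node is
+	 * written to flash padded out to CHFS_PAD(size).
+	 */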
//dbg("namelen: %zu | nsize: %hhu\n", namelen, fd->nsize); + + + //dbg("size: %zu | PADDED: %zu\n", size, CHFS_PAD(size)); + fdirent->magic = htole16(CHFS_FS_MAGIC_BITMASK); + fdirent->type = htole16(CHFS_NODETYPE_DIRENT); + fdirent->length = htole32(CHFS_PAD(size)); + fdirent->hdr_crc = htole32(crc32(0, (uint8_t *)fdirent, + CHFS_NODE_HDR_SIZE - 4)); + fdirent->vno = htole64(ino); + fdirent->pvno = htole64(pdir->ino); + fdirent->version = htole64(++pdir->chvc->highest_version); + fdirent->mctime = ip?ip->ctime:0; + fdirent->nsize = fd->nsize; + fdirent->dtype = fd->type; + fdirent->name_crc = crc32(0, (uint8_t *)&(fd->name), fd->nsize); + fdirent->node_crc = crc32(0, (uint8_t *)fdirent, sizeof(*fdirent) - 4); + + vec[0].iov_base = fdirent; + vec[0].iov_len = sizeof(*fdirent); + vec[1].iov_base = name; + vec[1].iov_len = namelen; + +retry: + if (prio == ALLOC_GC) { + /* the GC calls this function */ + err = chfs_reserve_space_gc(chmp, CHFS_PAD(size)); + if (err) + goto out; + } else { + chfs_gc_trigger(chmp); + if (prio == ALLOC_NORMAL) + err = chfs_reserve_space_normal(chmp, + CHFS_PAD(size), ALLOC_NORMAL); + else + err = chfs_reserve_space_normal(chmp, + CHFS_PAD(size), ALLOC_DELETION); + if (err) + goto out; + } + + nref = chfs_alloc_node_ref(chmp->chm_nextblock); + if (!nref) { + err = ENOMEM; + goto out; + } + + mutex_enter(&chmp->chm_lock_sizes); + + nref->nref_offset = chmp->chm_ebh->eb_size - chmp->chm_nextblock->free_size; + chfs_change_size_free(chmp, chmp->chm_nextblock, -CHFS_PAD(size)); + + err = chfs_write_wbuf(chmp, vec, 2, nref->nref_offset, &retlen); + if (err || retlen != CHFS_PAD(size)) { + chfs_err("error while writing out flash dirent node to the media\n"); + chfs_err("err: %d | size: %zu | retlen : %zu\n", + err, CHFS_PAD(size), retlen); + chfs_change_size_dirty(chmp, + chmp->chm_nextblock, CHFS_PAD(size)); + if (retries) { + err = EIO; + mutex_exit(&chmp->chm_lock_sizes); + goto out; + } + + retries++; + mutex_exit(&chmp->chm_lock_sizes); + goto retry; + } + + + // Everything went well + chfs_change_size_used(chmp, + &chmp->chm_blocks[nref->nref_lnr], CHFS_PAD(size)); + mutex_exit(&chmp->chm_lock_sizes); + KASSERT(chmp->chm_blocks[nref->nref_lnr].used_size <= chmp->chm_ebh->eb_size); + fd->nref = nref; + if (prio != ALLOC_DELETION) { + chfs_add_node_to_list(chmp, + pdir->chvc, nref, &pdir->chvc->dirents); + } +out: + chfs_free_flash_dirent(fdirent); + return err; +} + +/** + * chfs_write_flash_dnode - write out a data node to flash + * @chmp: chfs mount structure + * @vp: vnode where the data belongs to + * @bp: buffer contains data + */ +int +chfs_write_flash_dnode(struct chfs_mount *chmp, struct vnode *vp, + struct buf *bp, struct chfs_full_dnode *fd) +{ + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + int err = 0, retries = 0; + size_t size, retlen; + off_t ofs; + struct chfs_flash_data_node *dnode; + struct chfs_node_ref *nref; + struct chfs_inode *ip = VTOI(vp); + struct iovec vec[2]; + uint32_t len; + void *tmpbuf = NULL; + + KASSERT(ip->ino != CHFS_ROOTINO); + + dnode = chfs_alloc_flash_dnode(); + if (!dnode) + return ENOMEM; + + /* initialize flash data node */ + ofs = bp->b_blkno * PAGE_SIZE; + //dbg("vp->v_size: %ju, bp->b_blkno: %ju, bp-b_data: %p," + // " bp->b_resid: %ju\n", + // (uintmax_t )vp->v_size, (uintmax_t )bp->b_blkno, + // bp->b_data, (uintmax_t )bp->b_resid); + //dbg("[XXX]vp->v_size - ofs: %llu\n", (vp->v_size - ofs)); + len = MIN((vp->v_size - ofs), bp->b_resid); + size = sizeof(*dnode) + len; + + dnode->magic = 
htole16(CHFS_FS_MAGIC_BITMASK); + dnode->type = htole16(CHFS_NODETYPE_DATA); + dnode->length = htole32(CHFS_PAD(size)); + dnode->hdr_crc = htole32(crc32(0, (uint8_t *)dnode, + CHFS_NODE_HDR_SIZE - 4)); + dnode->vno = htole64(ip->ino); + dnode->version = htole64(++ip->chvc->highest_version); + dnode->offset = htole64(ofs); + dnode->data_length = htole32(len); + dnode->data_crc = htole32(crc32(0, (uint8_t *)bp->b_data, len)); + dnode->node_crc = htole32(crc32(0, (uint8_t *)dnode, + sizeof(*dnode) - 4)); + + dbg("dnode @%llu %ub v%llu\n", (unsigned long long)dnode->offset, + dnode->data_length, (unsigned long long)dnode->version); + + if (CHFS_PAD(size) - sizeof(*dnode)) { + tmpbuf = kmem_zalloc(CHFS_PAD(size) + - sizeof(*dnode), KM_SLEEP); + memcpy(tmpbuf, bp->b_data, len); + } + + /* creating iovecs for wbuf */ + vec[0].iov_base = dnode; + vec[0].iov_len = sizeof(*dnode); + vec[1].iov_base = tmpbuf; + vec[1].iov_len = CHFS_PAD(size) - sizeof(*dnode); + + fd->frags = 0; + fd->ofs = ofs; + fd->size = len; + +retry: + + /* Reserve space for data node. This will set up the next eraseblock + * where to we will write. + */ + + chfs_gc_trigger(chmp); + err = chfs_reserve_space_normal(chmp, + CHFS_PAD(size), ALLOC_NORMAL); + if (err) + goto out; + + nref = chfs_alloc_node_ref(chmp->chm_nextblock); + if (!nref) { + err = ENOMEM; + goto out; + } + + nref->nref_offset = + chmp->chm_ebh->eb_size - chmp->chm_nextblock->free_size; + + KASSERT(nref->nref_offset < chmp->chm_ebh->eb_size); + + mutex_enter(&chmp->chm_lock_sizes); + + chfs_change_size_free(chmp, + chmp->chm_nextblock, -CHFS_PAD(size)); + + //dbg("vno: %llu nref lnr: %u offset: %u\n", + // dnode->vno, nref->nref_lnr, nref->nref_offset); + + err = chfs_write_wbuf(chmp, vec, 2, nref->nref_offset, &retlen); + if (err || retlen != CHFS_PAD(size)) { + chfs_err("error while writing out flash data node to the media\n"); + chfs_err("err: %d | size: %zu | retlen : %zu\n", + err, size, retlen); + chfs_change_size_dirty(chmp, + chmp->chm_nextblock, CHFS_PAD(size)); + if (retries) { + err = EIO; + mutex_exit(&chmp->chm_lock_sizes); + goto out; + } + + retries++; + mutex_exit(&chmp->chm_lock_sizes); + goto retry; + } + /* Everything went well */ + ip->write_size += fd->size; + chfs_change_size_used(chmp, + &chmp->chm_blocks[nref->nref_lnr], CHFS_PAD(size)); + mutex_exit(&chmp->chm_lock_sizes); + + KASSERT(chmp->chm_blocks[nref->nref_lnr].used_size <= chmp->chm_ebh->eb_size); + fd->nref = nref; + chfs_add_node_to_list(chmp, ip->chvc, nref, &ip->chvc->dnode); +out: + chfs_free_flash_dnode(dnode); + if (CHFS_PAD(size) - sizeof(*dnode)) { + kmem_free(tmpbuf, CHFS_PAD(size) - sizeof(*dnode)); + } + + return err; +} + +/** + * chfs_do_link - makes a copy from a node + * @old: old node + * @oldfd: dirent of old node + * @parent: parent of new node + * @name: name of new node + * @namelen: length of name + * This function writes the dirent of the new node to the media. 
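+ * (The parameter list above appears to describe an earlier prototype;
+ * the function below takes (ip, parent, name, namelen, type), bumps the
+ * link counts of both inodes and writes the new dirent for ip under
+ * parent.)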
+ */ +int +chfs_do_link(struct chfs_inode *ip, struct chfs_inode *parent, const char *name, int namelen, enum vtype type) +{ + int error = 0; + struct vnode *vp = ITOV(ip); + struct ufsmount *ump = VFSTOUFS(vp->v_mount); + struct chfs_mount *chmp = ump->um_chfs; + struct chfs_dirent *newfd = NULL; +// struct chfs_dirent *fd = NULL; + + //dbg("link vno: %llu\n", ip->ino); + + newfd = chfs_alloc_dirent(namelen + 1); + + newfd->vno = ip->ino; + newfd->type = type; + newfd->nsize = namelen; + memcpy(newfd->name, name, namelen); + newfd->name[newfd->nsize] = 0; +// newfd->next = NULL; + + ip->chvc->nlink++; + parent->chvc->nlink++; + ip->iflag |= IN_CHANGE; + chfs_update(vp, NULL, NULL, UPDATE_WAIT); + + mutex_enter(&chmp->chm_lock_mountfields); + + error = chfs_write_flash_vnode(chmp, ip, ALLOC_NORMAL); + if (error) + return error; + + error = chfs_write_flash_dirent(chmp, + parent, ip, newfd, ip->ino, ALLOC_NORMAL); + /* TODO: what should we do if error isn't zero? */ + + mutex_exit(&chmp->chm_lock_mountfields); + + /* add fd to the fd list */ + TAILQ_INSERT_TAIL(&parent->dents, newfd, fds); +#if 0 + fd = parent->dents; + if (!fd) { + parent->dents = newfd; + } else { + while (fd->next) + fd = fd->next; + fd->next = newfd; + } +#endif + + return error; +} + + +/** + * chfs_do_unlink - delete a node + * @ip: node what we'd like to delete + * @parent: parent of the node + * @name: name of the node + * @namelen: length of name + * This function set the nlink and vno of the node zero and write its dirent to the media. + */ +int +chfs_do_unlink(struct chfs_inode *ip, + struct chfs_inode *parent, const char *name, int namelen) +{ + struct chfs_dirent *fd, *tmpfd; + int error = 0; + struct vnode *vp = ITOV(ip); + struct ufsmount *ump = VFSTOUFS(vp->v_mount); + struct chfs_mount *chmp = ump->um_chfs; + struct chfs_node_ref *nref; + + //dbg("unlink vno: %llu\n", ip->ino); + + vflushbuf(vp, 0); + + mutex_enter(&chmp->chm_lock_mountfields); + + /* remove the full direntry from the parent dents list */ + TAILQ_FOREACH_SAFE(fd, &parent->dents, fds, tmpfd) { + if (fd->vno == ip->ino && + fd->nsize == namelen && + !memcmp(fd->name, name, fd->nsize)) { + if (fd->type == VDIR && ip->chvc->nlink == 2) + ip->chvc->nlink = 0; + else + ip->chvc->nlink--; + + fd->type = VNON; + + TAILQ_REMOVE(&parent->dents, fd, fds); + + /* remove nref from dirents list */ + nref = parent->chvc->dirents; + if (nref == fd->nref) { + nref->nref_next = fd->nref->nref_next; + } else { + while (nref->nref_next && nref->nref_next != fd->nref) + nref = nref->nref_next; + if (nref->nref_next) + nref->nref_next = fd->nref->nref_next; + } + + //dbg("FD->NREF vno: %llu, lnr: %u, ofs: %u\n", + // fd->vno, fd->nref->nref_lnr, fd->nref->nref_offset); + chfs_mark_node_obsolete(chmp, fd->nref); + + error = chfs_write_flash_dirent(chmp, + parent, ip, fd, 0, ALLOC_DELETION); + + //dbg("FD->NREF vno: %llu, lnr: %u, ofs: %u\n", + // fd->vno, fd->nref->nref_lnr, fd->nref->nref_offset); + chfs_mark_node_obsolete(chmp, fd->nref); + + nref = ip->chvc->dnode; + while (nref != (struct chfs_node_ref *)ip->chvc) { + //dbg("DATA NREF\n"); + chfs_mark_node_obsolete(chmp, nref); + nref = nref->nref_next; + } + ip->chvc->dnode = (struct chfs_node_ref *)ip->chvc; + + nref = ip->chvc->v; + while (nref != (struct chfs_node_ref *)ip->chvc) { + //dbg("V NREF\n"); + chfs_mark_node_obsolete(chmp, nref); + nref = nref->nref_next; + } + ip->chvc->v = ip->chvc->v->nref_next; + + parent->chvc->nlink--; + //TODO: if error + } + } + mutex_exit(&chmp->chm_lock_mountfields); 
+ + return error; +} diff --git a/sys/ufs/chfs/debug.c b/sys/ufs/chfs/debug.c new file mode 100644 index 000000000..0d1fa5b52 --- /dev/null +++ b/sys/ufs/chfs/debug.c @@ -0,0 +1,48 @@ +/* $NetBSD: debug.c,v 1.1 2011/11/24 15:51:32 ahoka Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * XipFFS -- Xip Flash File System + * + * Copyright (C) 2009 Ferenc Havasi , + * Zoltan Sogor , + * ... + * University of Szeged, Hungary + * + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include "chfs.h" +//#include + diff --git a/include/ufs/chfs/debug.h b/sys/ufs/chfs/debug.h similarity index 100% rename from include/ufs/chfs/debug.h rename to sys/ufs/chfs/debug.h diff --git a/sys/ufs/chfs/ebh.c b/sys/ufs/chfs/ebh.c new file mode 100644 index 000000000..ff0d984ee --- /dev/null +++ b/sys/ufs/chfs/ebh.c @@ -0,0 +1,2141 @@ +/* $NetBSD: ebh.c,v 1.2 2011/11/25 11:15:24 ahoka Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * Copyright (C) 2009 Ferenc Havasi + * Copyright (C) 2009 Zoltan Sogor + * Copyright (C) 2009 David Tengeri + * Copyright (C) 2009 Tamas Toth + * Copyright (C) 2010 Adam Hoka + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "ebh.h" + +/*****************************************************************************/ +/* Flash specific operations */ +/*****************************************************************************/ +int nor_create_eb_hdr(struct chfs_eb_hdr *ebhdr, int lnr); +int nand_create_eb_hdr(struct chfs_eb_hdr *ebhdr, int lnr); +int nor_calc_data_offs(struct chfs_ebh *ebh, int pebnr, int offset); +int nand_calc_data_offs(struct chfs_ebh *ebh, int pebnr, int offset); +int nor_read_eb_hdr(struct chfs_ebh *ebh, int pebnr, struct chfs_eb_hdr *ebhdr); +int nand_read_eb_hdr(struct chfs_ebh *ebh, int pebnr, struct chfs_eb_hdr *ebhdr); +int nor_write_eb_hdr(struct chfs_ebh *ebh, int pebnr, struct chfs_eb_hdr *ebhdr); +int nand_write_eb_hdr(struct chfs_ebh *ebh, int pebnr,struct chfs_eb_hdr *ebhdr); +int nor_check_eb_hdr(struct chfs_ebh *ebh, void *buf); +int nand_check_eb_hdr(struct chfs_ebh *ebh, void *buf); +int nor_mark_eb_hdr_dirty_flash(struct chfs_ebh *ebh, int pebnr, int lid); +int nor_invalidate_eb_hdr(struct chfs_ebh *ebh, int pebnr); +int mark_eb_hdr_free(struct chfs_ebh *ebh, int pebnr, int ec); + +int ltree_entry_cmp(struct chfs_ltree_entry *le1, struct chfs_ltree_entry *le2); +int peb_in_use_cmp(struct chfs_peb *peb1, struct chfs_peb *peb2); +int peb_free_cmp(struct chfs_peb *peb1, struct chfs_peb *peb2); +int add_peb_to_erase_queue(struct chfs_ebh *ebh, int pebnr, int ec,struct peb_queue *queue); +struct chfs_peb * find_peb_in_use(struct chfs_ebh *ebh, int pebnr); +int add_peb_to_free(struct chfs_ebh *ebh, int pebnr, int ec); +int add_peb_to_in_use(struct chfs_ebh *ebh, int pebnr, int ec); +void erase_callback(struct flash_erase_instruction *ei); +int free_peb(struct chfs_ebh *ebh); +int release_peb(struct chfs_ebh *ebh, int pebnr); +void erase_thread(void *data); +static void erase_thread_start(struct chfs_ebh *ebh); +static void erase_thread_stop(struct chfs_ebh *ebh); +int scan_leb_used_cmp(struct chfs_scan_leb *sleb1, struct chfs_scan_leb *sleb2); +int nor_scan_add_to_used(struct chfs_ebh *ebh, struct chfs_scan_info *si,struct chfs_eb_hdr *ebhdr, int pebnr, int leb_status); +int nor_process_eb(struct chfs_ebh *ebh, struct chfs_scan_info *si, + int pebnr, struct chfs_eb_hdr *ebhdr); +int nand_scan_add_to_used(struct chfs_ebh *ebh, struct chfs_scan_info *si,struct chfs_eb_hdr *ebhdr, int pebnr); +int nand_process_eb(struct chfs_ebh *ebh, struct chfs_scan_info *si, + int pebnr, struct chfs_eb_hdr *ebhdr); +struct chfs_scan_info *chfs_scan(struct chfs_ebh *ebh); +void scan_info_destroy(struct chfs_scan_info *si); +int scan_media(struct chfs_ebh *ebh); +int get_peb(struct chfs_ebh *ebh); +/** + * nor_create_eb_hdr - creates an eraseblock header for NOR flash + * @ebhdr: ebhdr to set + * @lnr: LEB number + */ +int 
+nor_create_eb_hdr(struct chfs_eb_hdr *ebhdr, int lnr) +{ + ebhdr->u.nor_hdr.lid = htole32(lnr); + return 0; +} + +/** + * nand_create_eb_hdr - creates an eraseblock header for NAND flash + * @ebhdr: ebhdr to set + * @lnr: LEB number + */ +int +nand_create_eb_hdr(struct chfs_eb_hdr *ebhdr, int lnr) +{ + ebhdr->u.nand_hdr.lid = htole32(lnr); + return 0; +} + +/** + * nor_calc_data_offs - calculates data offset on NOR flash + * @ebh: chfs eraseblock handler + * @pebnr: eraseblock number + * @offset: offset within the eraseblock + */ +int +nor_calc_data_offs(struct chfs_ebh *ebh, int pebnr, int offset) +{ + return pebnr * ebh->flash_if->erasesize + offset + + CHFS_EB_EC_HDR_SIZE + CHFS_EB_HDR_NOR_SIZE; +} + +/** + * nand_calc_data_offs - calculates data offset on NAND flash + * @ebh: chfs eraseblock handler + * @pebnr: eraseblock number + * @offset: offset within the eraseblock + */ +int +nand_calc_data_offs(struct chfs_ebh *ebh, int pebnr, int offset) +{ + return pebnr * ebh->flash_if->erasesize + offset + + 2 * ebh->flash_if->page_size; +} + +/** + * nor_read_eb_hdr - read ereaseblock header from NOR flash + * + * @ebh: chfs eraseblock handler + * @pebnr: eraseblock number + * @ebhdr: whereto store the data + * + * Reads the eraseblock header from media. + * Returns zero in case of success, error code in case of fail. + */ +int +nor_read_eb_hdr(struct chfs_ebh *ebh, + int pebnr, struct chfs_eb_hdr *ebhdr) +{ + int ret; + size_t retlen; + off_t ofs = pebnr * ebh->flash_if->erasesize; + + KASSERT(pebnr >= 0 && pebnr < ebh->peb_nr); + + ret = flash_read(ebh->flash_dev, + ofs, CHFS_EB_EC_HDR_SIZE, + &retlen, (unsigned char *) &ebhdr->ec_hdr); + + if (ret || retlen != CHFS_EB_EC_HDR_SIZE) + return ret; + + ofs += CHFS_EB_EC_HDR_SIZE; + ret = flash_read(ebh->flash_dev, + ofs, CHFS_EB_HDR_NOR_SIZE, + &retlen, (unsigned char *) &ebhdr->u.nor_hdr); + + if (ret || retlen != CHFS_EB_HDR_NOR_SIZE) + return ret; + + return 0; +} + +/** + * nand_read_eb_hdr - read ereaseblock header from NAND flash + * + * @ebh: chfs eraseblock handler + * @pebnr: eraseblock number + * @ebhdr: whereto store the data + * + * Reads the eraseblock header from media. It is on the first two page. + * Returns zero in case of success, error code in case of fail. + */ +int +nand_read_eb_hdr(struct chfs_ebh *ebh, int pebnr, + struct chfs_eb_hdr *ebhdr) +{ + int ret; + size_t retlen; + off_t ofs; + + KASSERT(pebnr >= 0 && pebnr < ebh->peb_nr); + + /* Read erase counter header from the first page. */ + ofs = pebnr * ebh->flash_if->erasesize; + ret = flash_read(ebh->flash_dev, + ofs, CHFS_EB_EC_HDR_SIZE, &retlen, + (unsigned char *) &ebhdr->ec_hdr); + if (ret || retlen != CHFS_EB_EC_HDR_SIZE) + return ret; + + /* Read NAND eraseblock header from the second page */ + ofs += ebh->flash_if->page_size; + ret = flash_read(ebh->flash_dev, + ofs, CHFS_EB_HDR_NAND_SIZE, &retlen, + (unsigned char *) &ebhdr->u.nand_hdr); + if (ret || retlen != CHFS_EB_HDR_NAND_SIZE) + return ret; + + return 0; +} + +/** + * nor_write_eb_hdr - write ereaseblock header to NOR flash + * + * @ebh: chfs eraseblock handler + * @pebnr: eraseblock number whereto write + * @ebh: ebh to write + * + * Writes the eraseblock header to media. + * Returns zero in case of success, error code in case of fail. 
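+ *
+ * The CRC is computed over everything in the NOR header except its first
+ * four bytes, i.e. except the crc field itself; nor_check_eb_hdr()
+ * recomputes it the same way.  Note that the result is stored below
+ * through u.nand_hdr.crc, which presumably only works because both union
+ * members keep their crc field at offset 0.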
+ */ +int +nor_write_eb_hdr(struct chfs_ebh *ebh, int pebnr, struct chfs_eb_hdr *ebhdr) +{ + int ret, crc; + size_t retlen; + + off_t ofs = pebnr * ebh->flash_if->erasesize + CHFS_EB_EC_HDR_SIZE; + + ebhdr->u.nor_hdr.lid = ebhdr->u.nor_hdr.lid + | htole32(CHFS_LID_NOT_DIRTY_BIT); + + crc = crc32(0, (uint8_t *)&ebhdr->u.nor_hdr + 4, + CHFS_EB_HDR_NOR_SIZE - 4); + ebhdr->u.nand_hdr.crc = htole32(crc); + + KASSERT(pebnr >= 0 && pebnr < ebh->peb_nr); + + ret = flash_write(ebh->flash_dev, + ofs, CHFS_EB_HDR_NOR_SIZE, &retlen, + (unsigned char *) &ebhdr->u.nor_hdr); + + if (ret || retlen != CHFS_EB_HDR_NOR_SIZE) + return ret; + + return 0; +} + +/** + * nand_write_eb_hdr - write ereaseblock header to NAND flash + * + * @ebh: chfs eraseblock handler + * @pebnr: eraseblock number whereto write + * @ebh: ebh to write + * + * Writes the eraseblock header to media. + * Returns zero in case of success, error code in case of fail. + */ +int +nand_write_eb_hdr(struct chfs_ebh *ebh, int pebnr, + struct chfs_eb_hdr *ebhdr) +{ + int ret, crc; + size_t retlen; + flash_off_t ofs; + + KASSERT(pebnr >= 0 && pebnr < ebh->peb_nr); + + ofs = pebnr * ebh->flash_if->erasesize + + ebh->flash_if->page_size; + + ebhdr->u.nand_hdr.serial = htole64(++(*ebh->max_serial)); + + crc = crc32(0, (uint8_t *)&ebhdr->u.nand_hdr + 4, + CHFS_EB_HDR_NAND_SIZE - 4); + ebhdr->u.nand_hdr.crc = htole32(crc); + + ret = flash_write(ebh->flash_dev, ofs, + CHFS_EB_HDR_NAND_SIZE, &retlen, + (unsigned char *) &ebhdr->u.nand_hdr); + + if (ret || retlen != CHFS_EB_HDR_NAND_SIZE) + return ret; + + return 0; +} + +/** + * nor_check_eb_hdr - check ereaseblock header read from NOR flash + * + * @ebh: chfs eraseblock handler + * @buf: eraseblock header to check + * + * Returns eraseblock header status. + */ +int +nor_check_eb_hdr(struct chfs_ebh *ebh, void *buf) +{ + uint32_t magic, crc, hdr_crc; + struct chfs_eb_hdr *ebhdr = buf; + le32 lid_save; + + //check is there a header + if (check_pattern((void *) &ebhdr->ec_hdr, + 0xFF, 0, CHFS_EB_EC_HDR_SIZE)) { + dbg_ebh("no header found\n"); + return EBHDR_LEB_NO_HDR; + } + + // check magic + magic = le32toh(ebhdr->ec_hdr.magic); + if (magic != CHFS_MAGIC_BITMASK) { + dbg_ebh("bad magic bitmask(exp: %x found %x)\n", + CHFS_MAGIC_BITMASK, magic); + return EBHDR_LEB_BADMAGIC; + } + + // check CRC_EC + hdr_crc = le32toh(ebhdr->ec_hdr.crc_ec); + crc = crc32(0, (uint8_t *) &ebhdr->ec_hdr + 8, 4); + if (hdr_crc != crc) { + dbg_ebh("bad crc_ec found\n"); + return EBHDR_LEB_BADCRC; + } + + /* check if the PEB is free: magic, crc_ec and erase_cnt is good and + * everything else is FFF.. 
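+	 * (check_pattern() is used here to tell whether the region still
+	 * reads as erased flash, i.e. 0xFF in every byte; a free PEB is
+	 * therefore one with a valid EC header but an all-0xFF NOR header.)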
+ */ + if (check_pattern((void *) &ebhdr->u.nor_hdr, 0xFF, 0, + CHFS_EB_HDR_NOR_SIZE)) { + dbg_ebh("free peb found\n"); + return EBHDR_LEB_FREE; + } + + // check invalidated (CRC == LID == 0) + if (ebhdr->u.nor_hdr.crc == 0 && ebhdr->u.nor_hdr.lid == 0) { + dbg_ebh("invalidated ebhdr found\n"); + return EBHDR_LEB_INVALIDATED; + } + + // check CRC + hdr_crc = le32toh(ebhdr->u.nor_hdr.crc); + lid_save = ebhdr->u.nor_hdr.lid; + + // mark lid as not dirty for crc calc + ebhdr->u.nor_hdr.lid = ebhdr->u.nor_hdr.lid | htole32( + CHFS_LID_NOT_DIRTY_BIT); + crc = crc32(0, (uint8_t *) &ebhdr->u.nor_hdr + 4, + CHFS_EB_HDR_NOR_SIZE - 4); + // restore the original lid value in ebh + ebhdr->u.nor_hdr.lid = lid_save; + + if (crc != hdr_crc) { + dbg_ebh("bad crc found\n"); + return EBHDR_LEB_BADCRC; + } + + // check dirty + if (!(le32toh(lid_save) & CHFS_LID_NOT_DIRTY_BIT)) { + dbg_ebh("dirty ebhdr found\n"); + return EBHDR_LEB_DIRTY; + } + + return EBHDR_LEB_OK; +} + +/** + * nand_check_eb_hdr - check ereaseblock header read from NAND flash + * + * @ebh: chfs eraseblock handler + * @buf: eraseblock header to check + * + * Returns eraseblock header status. + */ +int +nand_check_eb_hdr(struct chfs_ebh *ebh, void *buf) +{ + uint32_t magic, crc, hdr_crc; + struct chfs_eb_hdr *ebhdr = buf; + + //check is there a header + if (check_pattern((void *) &ebhdr->ec_hdr, + 0xFF, 0, CHFS_EB_EC_HDR_SIZE)) { + dbg_ebh("no header found\n"); + return EBHDR_LEB_NO_HDR; + } + + // check magic + magic = le32toh(ebhdr->ec_hdr.magic); + if (magic != CHFS_MAGIC_BITMASK) { + dbg_ebh("bad magic bitmask(exp: %x found %x)\n", + CHFS_MAGIC_BITMASK, magic); + return EBHDR_LEB_BADMAGIC; + } + + // check CRC_EC + hdr_crc = le32toh(ebhdr->ec_hdr.crc_ec); + crc = crc32(0, (uint8_t *) &ebhdr->ec_hdr + 8, 4); + if (hdr_crc != crc) { + dbg_ebh("bad crc_ec found\n"); + return EBHDR_LEB_BADCRC; + } + + /* check if the PEB is free: magic, crc_ec and erase_cnt is good and + * everything else is FFF.. + */ + if (check_pattern((void *) &ebhdr->u.nand_hdr, 0xFF, 0, + CHFS_EB_HDR_NAND_SIZE)) { + dbg_ebh("free peb found\n"); + return EBHDR_LEB_FREE; + } + + // check CRC + hdr_crc = le32toh(ebhdr->u.nand_hdr.crc); + + crc = crc32(0, (uint8_t *) &ebhdr->u.nand_hdr + 4, + CHFS_EB_HDR_NAND_SIZE - 4); + + if (crc != hdr_crc) { + dbg_ebh("bad crc found\n"); + return EBHDR_LEB_BADCRC; + } + + return EBHDR_LEB_OK; +} + +/** + * nor_mark_eb_hdr_dirty_flash- mark ereaseblock header dirty on NOR flash + * + * @ebh: chfs eraseblock handler + * @pebnr: eraseblock number + * @lid: leb id (it's bit number 31 will be set to 0) + * + * It pulls the CHFS_LID_NOT_DIRTY_BIT to zero on flash. + * + * Returns zero in case of success, error code in case of fail. + */ +int +nor_mark_eb_hdr_dirty_flash(struct chfs_ebh *ebh, int pebnr, int lid) +{ + int ret; + size_t retlen; + off_t ofs; + + /* mark leb id dirty */ + lid = htole32(lid & CHFS_LID_DIRTY_BIT_MASK); + + /* calculate position */ + ofs = pebnr * ebh->flash_if->erasesize + CHFS_EB_EC_HDR_SIZE + + CHFS_GET_MEMBER_POS(struct chfs_nor_eb_hdr , lid); + + ret = flash_write(ebh->flash_dev, ofs, sizeof(lid), &retlen, + (unsigned char *) &lid); + if (ret || retlen != sizeof(lid)) { + chfs_err("can't mark peb dirty"); + return ret; + } + + return 0; +} + +/** + * nor_invalidate_eb_hdr - invalidate ereaseblock header on NOR flash + * + * @ebh: chfs eraseblock handler + * @pebnr: eraseblock number + * + * Sets crc and lip field to zero. + * Returns zero in case of success, error code in case of fail. 
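+ *
+ * Overwriting these fields with zeroes works without an erase cycle
+ * because NOR flash lets individual bits be programmed from 1 to 0 in
+ * place; the same property is what makes the dirty-bit update in
+ * nor_mark_eb_hdr_dirty_flash() possible.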
+ */ +int +nor_invalidate_eb_hdr(struct chfs_ebh *ebh, int pebnr) +{ + int ret; + size_t retlen; + off_t ofs; + char zero_buf[CHFS_INVALIDATE_SIZE]; + + /* fill with zero */ + memset(zero_buf, 0x0, CHFS_INVALIDATE_SIZE); + + /* calculate position (!!! lid is directly behind crc !!!) */ + ofs = pebnr * ebh->flash_if->erasesize + CHFS_EB_EC_HDR_SIZE + + CHFS_GET_MEMBER_POS(struct chfs_nor_eb_hdr, crc); + + ret = flash_write(ebh->flash_dev, + ofs, CHFS_INVALIDATE_SIZE, &retlen, + (unsigned char *) &zero_buf); + if (ret || retlen != CHFS_INVALIDATE_SIZE) { + chfs_err("can't invalidate peb"); + return ret; + } + + return 0; +} + +/** + * mark_eb_hdr_free - free ereaseblock header on NOR or NAND flash + * + * @ebh: chfs eraseblock handler + * @pebnr: eraseblock number + * @ec: erase counter of PEB + * + * Write out the magic and erase counter to the physical eraseblock. + * Returns zero in case of success, error code in case of fail. + */ +int +mark_eb_hdr_free(struct chfs_ebh *ebh, int pebnr, int ec) +{ + int ret, crc; + size_t retlen; + off_t ofs; + struct chfs_eb_hdr *ebhdr; + ebhdr = kmem_alloc(sizeof(struct chfs_eb_hdr), KM_SLEEP); + + ebhdr->ec_hdr.magic = htole32(CHFS_MAGIC_BITMASK); + ebhdr->ec_hdr.erase_cnt = htole32(ec); + crc = crc32(0, (uint8_t *) &ebhdr->ec_hdr + 8, 4); + ebhdr->ec_hdr.crc_ec = htole32(crc); + + ofs = pebnr * ebh->flash_if->erasesize; + + KASSERT(sizeof(ebhdr->ec_hdr) == CHFS_EB_EC_HDR_SIZE); + + ret = flash_write(ebh->flash_dev, + ofs, CHFS_EB_EC_HDR_SIZE, &retlen, + (unsigned char *) &ebhdr->ec_hdr); + + if (ret || retlen != CHFS_EB_EC_HDR_SIZE) { + chfs_err("can't mark peb as free: %d\n", pebnr); + kmem_free(ebhdr, sizeof(struct chfs_eb_hdr)); + return ret; + } + + kmem_free(ebhdr, sizeof(struct chfs_eb_hdr)); + return 0; +} + +/*****************************************************************************/ +/* End of Flash specific operations */ +/*****************************************************************************/ + +/*****************************************************************************/ +/* Lock Tree */ +/*****************************************************************************/ + +int +ltree_entry_cmp(struct chfs_ltree_entry *le1, + struct chfs_ltree_entry *le2) +{ + return (le1->lnr - le2->lnr); +} + +/* Generate functions for Lock tree's red-black tree */ +RB_PROTOTYPE( ltree_rbtree, chfs_ltree_entry, rb, ltree_entry_cmp); +RB_GENERATE( ltree_rbtree, chfs_ltree_entry, rb, ltree_entry_cmp); + + +/** + * ltree_lookup - looks up a logical eraseblock in the lock tree + * @ebh: chfs eraseblock handler + * @lid: identifier of the logical eraseblock + * + * This function returns a pointer to the wanted &struct chfs_ltree_entry + * if the logical eraseblock is in the lock tree, so it is locked, NULL + * otherwise. + * @ebh->ltree_lock has to be locked! + */ +static struct chfs_ltree_entry * +ltree_lookup(struct chfs_ebh *ebh, int lnr) +{ + struct chfs_ltree_entry le, *result; + le.lnr = lnr; + result = RB_FIND(ltree_rbtree, &ebh->ltree, &le); + return result; +} + +/** + * ltree_add_entry - add an entry to the lock tree + * @ebh: chfs eraseblock handler + * @lnr: identifier of the logical eraseblock + * + * This function adds a new logical eraseblock entry identified with @lnr to the + * lock tree. If the entry is already in the tree, it increases the user + * counter. + * Returns NULL if can not allocate memory for lock tree entry, or a pointer + * to the inserted entry otherwise. 
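+ * (Since the allocation below uses KM_SLEEP it cannot actually fail, so
+ * the NULL case is defensive only.)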
+ */ +static struct chfs_ltree_entry * +ltree_add_entry(struct chfs_ebh *ebh, int lnr) +{ + struct chfs_ltree_entry *le, *result; + + le = kmem_alloc(sizeof(struct chfs_ltree_entry), KM_SLEEP); + + le->lnr = lnr; + le->users = 1; + rw_init(&le->mutex); + + //dbg_ebh("enter ltree lock\n"); + mutex_enter(&ebh->ltree_lock); + //dbg_ebh("insert\n"); + result = RB_INSERT(ltree_rbtree, &ebh->ltree, le); + //dbg_ebh("inserted\n"); + if (result) { + //The entry is already in the tree + result->users++; + kmem_free(le, sizeof(struct chfs_ltree_entry)); + } + else { + result = le; + } + mutex_exit(&ebh->ltree_lock); + + return result; +} + +/** + * leb_read_lock - lock a logical eraseblock for read + * @ebh: chfs eraseblock handler + * @lnr: identifier of the logical eraseblock + * + * Returns zero in case of success, error code in case of fail. + */ +static int +leb_read_lock(struct chfs_ebh *ebh, int lnr) +{ + struct chfs_ltree_entry *le; + + le = ltree_add_entry(ebh, lnr); + if (!le) + return ENOMEM; + + rw_enter(&le->mutex, RW_READER); + return 0; +} + +/** + * leb_read_unlock - unlock a logical eraseblock from read + * @ebh: chfs eraseblock handler + * @lnr: identifier of the logical eraseblock + * + * This function unlocks a logical eraseblock from read and delete it from the + * lock tree is there are no more users of it. + */ +static void +leb_read_unlock(struct chfs_ebh *ebh, int lnr) +{ + struct chfs_ltree_entry *le; + + mutex_enter(&ebh->ltree_lock); + //dbg_ebh("LOCK: ebh->ltree_lock spin locked in leb_read_unlock()\n"); + le = ltree_lookup(ebh, lnr); + if (!le) + goto out; + + le->users -= 1; + KASSERT(le->users >= 0); + rw_exit(&le->mutex); + if (le->users == 0) { + le = RB_REMOVE(ltree_rbtree, &ebh->ltree, le); + if (le) { + KASSERT(!rw_lock_held(&le->mutex)); + rw_destroy(&le->mutex); + + kmem_free(le, sizeof(struct chfs_ltree_entry)); + } + } + +out: + mutex_exit(&ebh->ltree_lock); + //dbg_ebh("UNLOCK: ebh->ltree_lock spin unlocked in leb_read_unlock()\n"); +} + +/** + * leb_write_lock - lock a logical eraseblock for write + * @ebh: chfs eraseblock handler + * @lnr: identifier of the logical eraseblock + * + * Returns zero in case of success, error code in case of fail. + */ +static int +leb_write_lock(struct chfs_ebh *ebh, int lnr) +{ + struct chfs_ltree_entry *le; + + le = ltree_add_entry(ebh, lnr); + if (!le) + return ENOMEM; + + rw_enter(&le->mutex, RW_WRITER); + return 0; +} + +/** + * leb_write_unlock - unlock a logical eraseblock from write + * @ebh: chfs eraseblock handler + * @lnr: identifier of the logical eraseblock + * + * This function unlocks a logical eraseblock from write and delete it from the + * lock tree is there are no more users of it. 
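+ *
+ * The lock/unlock pairs are used bracket-style; e.g. ebh_write_leb()
+ * below does, in outline:
+ *
+ *	leb_write_lock(ebh, lnr);
+ *	... write the header and data to the mapped PEB ...
+ *	leb_write_unlock(ebh, lnr);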
+ */ +static void +leb_write_unlock(struct chfs_ebh *ebh, int lnr) +{ + struct chfs_ltree_entry *le; + + mutex_enter(&ebh->ltree_lock); + //dbg_ebh("LOCK: ebh->ltree_lock spin locked in leb_write_unlock()\n"); + le = ltree_lookup(ebh, lnr); + if (!le) + goto out; + + le->users -= 1; + KASSERT(le->users >= 0); + rw_exit(&le->mutex); + if (le->users == 0) { + RB_REMOVE(ltree_rbtree, &ebh->ltree, le); + + KASSERT(!rw_lock_held(&le->mutex)); + rw_destroy(&le->mutex); + + kmem_free(le, sizeof(struct chfs_ltree_entry)); + } + +out: + mutex_exit(&ebh->ltree_lock); + //dbg_ebh("UNLOCK: ebh->ltree_lock spin unlocked in leb_write_unlock()\n"); +} + +/*****************************************************************************/ +/* End of Lock Tree */ +/*****************************************************************************/ + +/*****************************************************************************/ +/* Erase related operations */ +/*****************************************************************************/ + +/** + * If the first argument is smaller than the second, the function + * returns a value smaller than zero. If they are equal, the function re- + * turns zero. Otherwise, it should return a value greater than zero. + */ +int +peb_in_use_cmp(struct chfs_peb *peb1, struct chfs_peb *peb2) +{ + return (peb1->pebnr - peb2->pebnr); +} + +int +peb_free_cmp(struct chfs_peb *peb1, struct chfs_peb *peb2) +{ + int comp; + + comp = peb1->erase_cnt - peb2->erase_cnt; + if (0 == comp) + comp = peb1->pebnr - peb2->pebnr; + + return comp; +} + +/* Generate functions for in use PEB's red-black tree */ +RB_PROTOTYPE(peb_in_use_rbtree, chfs_peb, u.rb, peb_in_use_cmp); +RB_GENERATE(peb_in_use_rbtree, chfs_peb, u.rb, peb_in_use_cmp); +RB_PROTOTYPE(peb_free_rbtree, chfs_peb, u.rb, peb_free_cmp); +RB_GENERATE(peb_free_rbtree, chfs_peb, u.rb, peb_free_cmp); + +/** + * add_peb_to_erase_queue: adds a PEB to to_erase/fully_erased queue + * @ebh - chfs eraseblock handler + * @pebnr - physical eraseblock's number + * @ec - erase counter of PEB + * @queue: the queue to add to + * + * This function adds a PEB to the erase queue specified by @queue. + * The @ebh->erase_lock must be locked before using this. + * Returns zero in case of success, error code in case of fail. + */ +int +add_peb_to_erase_queue(struct chfs_ebh *ebh, int pebnr, int ec, + struct peb_queue *queue) +{ + struct chfs_peb *peb; + + peb = kmem_alloc(sizeof(struct chfs_peb), KM_SLEEP); + + peb->erase_cnt = ec; + peb->pebnr = pebnr; + + TAILQ_INSERT_TAIL(queue, peb, u.queue); + + return 0; + +} +//TODO +/** + * find_peb_in_use - looks up a PEB in the RB-tree of used blocks + * @ebh - chfs eraseblock handler + * + * This function returns a pointer to the PEB found in the tree, + * NULL otherwise. + * The @ebh->erase_lock must be locked before using this. + */ +struct chfs_peb * +find_peb_in_use(struct chfs_ebh *ebh, int pebnr) +{ + struct chfs_peb peb, *result; + peb.pebnr = pebnr; + result = RB_FIND(peb_in_use_rbtree, &ebh->in_use, &peb); + return result; +} + +/** + * add_peb_to_free - adds a PEB to the RB-tree of free PEBs + * @ebh - chfs eraseblock handler + * @pebnr - physical eraseblock's number + * @ec - erase counter of PEB + * + * + * This function adds a physical eraseblock to the RB-tree of free PEBs + * stored in the @ebh. The key is the erase counter and pebnr. + * The @ebh->erase_lock must be locked before using this. + * Returns zero in case of success, error code in case of fail. 
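+ * Ordering the tree by erase counter first is what drives wear levelling:
+ * get_peb() always takes RB_MIN of this tree, i.e. the physical block
+ * that has been erased the fewest times.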
+ */ +int +add_peb_to_free(struct chfs_ebh *ebh, int pebnr, int ec) +{ + struct chfs_peb *peb, *result; + + peb = kmem_alloc(sizeof(struct chfs_peb), KM_SLEEP); + + peb->erase_cnt = ec; + peb->pebnr = pebnr; + result = RB_INSERT(peb_free_rbtree, &ebh->free, peb); + if (result) + return 1; + + return 0; +} + +/** + * add_peb_to_in_use - adds a PEB to the RB-tree of used PEBs + * @ebh - chfs eraseblock handler + * @pebnr - physical eraseblock's number + * @ec - erase counter of PEB + * + * + * This function adds a physical eraseblock to the RB-tree of used PEBs + * stored in the @ebh. The key is pebnr. + * The @ebh->erase_lock must be locked before using this. + * Returns zero in case of success, error code in case of fail. + */ +int +add_peb_to_in_use(struct chfs_ebh *ebh, int pebnr, int ec) +{ + struct chfs_peb *peb, *result; + + peb = kmem_alloc(sizeof(struct chfs_peb), KM_SLEEP); + + peb->erase_cnt = ec; + peb->pebnr = pebnr; + result = RB_INSERT(peb_in_use_rbtree, &ebh->in_use, peb); + if (result) + return 1; + + return 0; +} + +/** + * erase_callback - callback function for flash erase + * @ei: erase information + */ +void +erase_callback(struct flash_erase_instruction *ei) +{ + int err; + struct chfs_erase_info_priv *priv = (void *) ei->ei_priv; + //dbg_ebh("ERASE_CALLBACK() CALLED\n"); + struct chfs_ebh *ebh = priv->ebh; + struct chfs_peb *peb = priv->peb; + + peb->erase_cnt += 1; + + if (ei->ei_state == FLASH_ERASE_DONE) { + + /* Write out erase counter */ + err = ebh->ops->mark_eb_hdr_free(ebh, + peb->pebnr, peb->erase_cnt); + if (err) { + /* cannot mark PEB as free,so erase it again */ + chfs_err( + "cannot mark eraseblock as free, PEB: %d\n", + peb->pebnr); + mutex_enter(&ebh->erase_lock); + /*dbg_ebh("LOCK: ebh->erase_lock spin locked in erase_callback() " + "after mark ebhdr free\n");*/ + add_peb_to_erase_queue(ebh, peb->pebnr, peb->erase_cnt, + &ebh->to_erase); + mutex_exit(&ebh->erase_lock); + /*dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in erase_callback() " + "after mark ebhdr free\n");*/ + kmem_free(peb, sizeof(struct chfs_peb)); + return; + } + + mutex_enter(&ebh->erase_lock); + /*dbg_ebh("LOCK: ebh->erase_lock spin locked in erase_callback()\n");*/ + err = add_peb_to_free(ebh, peb->pebnr, peb->erase_cnt); + mutex_exit(&ebh->erase_lock); + /*dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in erase_callback()\n");*/ + kmem_free(peb, sizeof(struct chfs_peb)); + } else { + /* + * Erase is finished, but there was a problem, + * so erase PEB again + */ + chfs_err("erase failed, state is: 0x%x\n", ei->ei_state); + add_peb_to_erase_queue(ebh, peb->pebnr, peb->erase_cnt, &ebh->to_erase); + kmem_free(peb, sizeof(struct chfs_peb)); + } +} + +/** + * free_peb: free a PEB + * @ebh: chfs eraseblock handler + * + * This function erases the first physical eraseblock from one of the erase + * lists and adds to the RB-tree of free PEBs. + * Returns zero in case of succes, error code in case of fail. 
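+ * Blocks taken from the fully_erased queue only need a fresh EC header
+ * written; blocks taken from the to_erase queue are handed to
+ * flash_erase(), and erase_callback() writes the new header and puts
+ * them back on the free tree once the erase has finished.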
+ */ +int +free_peb(struct chfs_ebh *ebh) +{ + int err, retries = 0; + off_t ofs; + struct chfs_peb *peb = NULL; + struct flash_erase_instruction *ei; + + KASSERT(mutex_owned(&ebh->erase_lock)); + + if (!TAILQ_EMPTY(&ebh->fully_erased)) { + //dbg_ebh("[FREE PEB] got a fully erased block\n"); + peb = TAILQ_FIRST(&ebh->fully_erased); + TAILQ_REMOVE(&ebh->fully_erased, peb, u.queue); + err = ebh->ops->mark_eb_hdr_free(ebh, + peb->pebnr, peb->erase_cnt); + if (err) { + goto out_free; + } + err = add_peb_to_free(ebh, peb->pebnr, peb->erase_cnt); + goto out_free; + } + /* Erase PEB */ + //dbg_ebh("[FREE PEB] eraseing a block\n"); + peb = TAILQ_FIRST(&ebh->to_erase); + TAILQ_REMOVE(&ebh->to_erase, peb, u.queue); + mutex_exit(&ebh->erase_lock); + //dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in free_peb()\n"); + ofs = peb->pebnr * ebh->flash_if->erasesize; + + /* XXX where do we free this? */ + ei = kmem_alloc(sizeof(struct flash_erase_instruction) + + sizeof(struct chfs_erase_info_priv), KM_SLEEP); +retry: + memset(ei, 0, sizeof(*ei)); + +// ei->ei_if = ebh->flash_if; + ei->ei_addr = ofs; + ei->ei_len = ebh->flash_if->erasesize; + ei->ei_callback = erase_callback; + ei->ei_priv = (unsigned long) (&ei[1]); + + ((struct chfs_erase_info_priv *) ei->ei_priv)->ebh = ebh; + ((struct chfs_erase_info_priv *) ei->ei_priv)->peb = peb; + + err = flash_erase(ebh->flash_dev, ei); + dbg_ebh("erased peb: %d\n", peb->pebnr); + + /* einval would mean we did something wrong */ + KASSERT(err != EINVAL); + + if (err) { + dbg_ebh("errno: %d, ei->ei_state: %d\n", err, ei->ei_state); + if (CHFS_MAX_GET_PEB_RETRIES < ++retries && + ei->ei_state == FLASH_ERASE_FAILED) { + /* The block went bad mark it */ + dbg_ebh("ebh markbad! 0x%jx\n", (uintmax_t )ofs); + err = flash_block_markbad(ebh->flash_dev, ofs); + if (!err) { + ebh->peb_nr--; + } + + goto out; + } + chfs_err("can not erase PEB: %d, try again\n", peb->pebnr); + goto retry; + } + +out: + /* lock the erase_lock, because it was locked + * when the function was called */ + mutex_enter(&ebh->erase_lock); + return err; + +out_free: + kmem_free(peb, sizeof(struct chfs_peb)); + return err; +} + +/** + * release_peb - schedule an erase for the PEB + * @ebh: chfs eraseblock handler + * @pebnr: physical eraseblock number + * + * This function get the peb identified by @pebnr from the in_use RB-tree of + * @ebh, removes it and schedule an erase for it. + * + * Returns zero on success, error code in case of fail. 
+ */ +int +release_peb(struct chfs_ebh *ebh, int pebnr) +{ + int err = 0; + struct chfs_peb *peb; + + mutex_enter(&ebh->erase_lock); + + //dbg_ebh("LOCK: ebh->erase_lock spin locked in release_peb()\n"); + peb = find_peb_in_use(ebh, pebnr); + if (!peb) { + chfs_err("LEB is mapped, but is not in the 'in_use' " + "tree of ebh\n"); + goto out_unlock; + } + err = add_peb_to_erase_queue(ebh, peb->pebnr, peb->erase_cnt, + &ebh->to_erase); + + if (err) + goto out_unlock; + + RB_REMOVE(peb_in_use_rbtree, &ebh->in_use, peb); +out_unlock: + mutex_exit(&ebh->erase_lock); + //dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in release_peb()" + // " at out_unlock\n"); + return err; +} + +/** + * erase_thread - background thread for erasing PEBs + * @data: pointer to the eraseblock handler + */ +/*void + erase_thread(void *data) + { + struct chfs_ebh *ebh = data; + + dbg_ebh("erase thread started\n"); + while (ebh->bg_erase.eth_running) { + int err; + + mutex_enter(&ebh->erase_lock); + dbg_ebh("LOCK: ebh->erase_lock spin locked in erase_thread()\n"); + if (TAILQ_EMPTY(&ebh->to_erase) && TAILQ_EMPTY(&ebh->fully_erased)) { + dbg_ebh("thread has nothing to do\n"); + mutex_exit(&ebh->erase_lock); + mutex_enter(&ebh->bg_erase.eth_thread_mtx); + cv_timedwait_sig(&ebh->bg_erase.eth_wakeup, + &ebh->bg_erase.eth_thread_mtx, mstohz(100)); + mutex_exit(&ebh->bg_erase.eth_thread_mtx); + + dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in erase_thread()\n"); + continue; + } + mutex_exit(&ebh->erase_lock); + dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in erase_thread()\n"); + + err = free_peb(ebh); + if (err) + chfs_err("freeing PEB failed in the background thread: %d\n", err); + + } + dbg_ebh("erase thread stopped\n"); + kthread_exit(0); + }*/ + +/** + * erase_thread - background thread for erasing PEBs + * @data: pointer to the eraseblock handler + */ +void +erase_thread(void *data) { + dbg_ebh("[EBH THREAD] erase thread started\n"); + + struct chfs_ebh *ebh = data; + int err; + + mutex_enter(&ebh->erase_lock); + while (ebh->bg_erase.eth_running) { + if (TAILQ_EMPTY(&ebh->to_erase) && + TAILQ_EMPTY(&ebh->fully_erased)) { + cv_timedwait_sig(&ebh->bg_erase.eth_wakeup, + &ebh->erase_lock, mstohz(100)); + } else { + /* XXX exiting this mutex is a bit odd here as + * free_peb instantly reenters it... 
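+			 * (free_peb() must be entered with erase_lock held
+			 * and returns with it held, but it drops and retakes
+			 * the lock internally around the flash_erase() call.)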
+ */ + err = free_peb(ebh); + mutex_exit(&ebh->erase_lock); + if (err) { + chfs_err("freeing PEB failed in the" + " background thread: %d\n", err); + } + mutex_enter(&ebh->erase_lock); + } + } + mutex_exit(&ebh->erase_lock); + + dbg_ebh("[EBH THREAD] erase thread stopped\n"); + kthread_exit(0); +} + +/** + * erase_thread_start - init and start erase thread + * @ebh: eraseblock handler + */ +static void +erase_thread_start(struct chfs_ebh *ebh) +{ + cv_init(&ebh->bg_erase.eth_wakeup, "ebheracv"); + + ebh->bg_erase.eth_running = true; + kthread_create(PRI_NONE, KTHREAD_MPSAFE | KTHREAD_MUSTJOIN, NULL, + erase_thread, ebh, &ebh->bg_erase.eth_thread, "ebherase"); +} + +/** + * erase_thread_stop - stop background erase thread + * @ebh: eraseblock handler + */ +static void +erase_thread_stop(struct chfs_ebh *ebh) +{ + ebh->bg_erase.eth_running = false; + cv_signal(&ebh->bg_erase.eth_wakeup); + dbg_ebh("[EBH THREAD STOP] signaled\n"); + + kthread_join(ebh->bg_erase.eth_thread); +#ifdef BROKEN_KTH_JOIN + kpause("chfsebhjointh", false, mstohz(1000), NULL); +#endif + + cv_destroy(&ebh->bg_erase.eth_wakeup); +} + +/*****************************************************************************/ +/* End of Erase related operations */ +/*****************************************************************************/ + +/*****************************************************************************/ +/* Scan related operations */ +/*****************************************************************************/ +int +scan_leb_used_cmp(struct chfs_scan_leb *sleb1, struct chfs_scan_leb *sleb2) +{ + return (sleb1->lnr - sleb2->lnr); +} + +RB_PROTOTYPE(scan_leb_used_rbtree, chfs_scan_leb, u.rb, scan_leb_used_cmp); +RB_GENERATE(scan_leb_used_rbtree, chfs_scan_leb, u.rb, scan_leb_used_cmp); + +/** + * scan_add_to_queue - adds a physical eraseblock to one of the + * eraseblock queue + * @si: chfs scanning information + * @pebnr: physical eraseblock number + * @erase_cnt: erase counter of the physical eraseblock + * @list: the list to add to + * + * This function adds a physical eraseblock to one of the lists in the scanning + * information. + * Returns zero in case of success, negative error code in case of fail. + */ +static int +scan_add_to_queue(struct chfs_scan_info *si, int pebnr, int erase_cnt, + struct scan_leb_queue *queue) +{ + struct chfs_scan_leb *sleb; + + sleb = kmem_alloc(sizeof(struct chfs_scan_leb), KM_SLEEP); + + sleb->pebnr = pebnr; + sleb->erase_cnt = erase_cnt; + TAILQ_INSERT_TAIL(queue, sleb, u.queue); + return 0; +} + +/* + * nor_scan_add_to_used - add a physical eraseblock to the + * used tree of scan info + * @ebh: chfs eraseblock handler + * @si: chfs scanning information + * @ebhdr: eraseblock header + * @pebnr: physical eraseblock number + * @leb_status: the status of the PEB's eraseblock header + * + * This function adds a PEB to the used tree of the scanning information. + * It handles the situations if there are more physical eraseblock referencing + * to the same logical eraseblock. + * Returns zero in case of success, error code in case of fail. 
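+ * On NOR the conflict is resolved with the dirty bit: a copy whose header
+ * is marked dirty was superseded by a newer write of the same LEB, so it
+ * is the one queued for erasing.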
+ */ +int +nor_scan_add_to_used(struct chfs_ebh *ebh, struct chfs_scan_info *si, + struct chfs_eb_hdr *ebhdr, int pebnr, int leb_status) +{ + int err, lnr, ec; + struct chfs_scan_leb *sleb, *old; + + lnr = CHFS_GET_LID(ebhdr->u.nor_hdr.lid); + ec = le32toh(ebhdr->ec_hdr.erase_cnt); + + sleb = kmem_alloc(sizeof(struct chfs_scan_leb), KM_SLEEP); + + sleb->erase_cnt = ec; + sleb->lnr = lnr; + sleb->pebnr = pebnr; + sleb->info = leb_status; + + old = RB_INSERT(scan_leb_used_rbtree, &si->used, sleb); + if (old) { + kmem_free(sleb, sizeof(struct chfs_scan_leb)); + /* There is already an eraseblock in the used tree */ + /* If the new one is bad */ + if (EBHDR_LEB_DIRTY == leb_status && + EBHDR_LEB_OK == old->info) { + return scan_add_to_queue(si, pebnr, ec, &si->erase); + } else { + err = scan_add_to_queue(si, old->pebnr, + old->erase_cnt, &si->erase); + if (err) { + return err; + } + + old->erase_cnt = ec; + old->lnr = lnr; + old->pebnr = pebnr; + old->info = leb_status; + return 0; + } + } + return 0; +} + +/** + * nor_process eb -read the headers from NOR flash, check them and add to + * the scanning information + * @ebh: chfs eraseblock handler + * @si: chfs scanning information + * @pebnr: physical eraseblock number + * + * Returns zero in case of success, error code in case of fail. + */ +int +nor_process_eb(struct chfs_ebh *ebh, struct chfs_scan_info *si, + int pebnr, struct chfs_eb_hdr *ebhdr) +{ + int err, erase_cnt, leb_status; + + err = ebh->ops->read_eb_hdr(ebh, pebnr, ebhdr); + if (err) + return err; + + erase_cnt = le32toh(ebhdr->ec_hdr.erase_cnt); + dbg_ebh("erase_cnt: %d\n", erase_cnt); + leb_status = ebh->ops->check_eb_hdr(ebh, ebhdr); + if (EBHDR_LEB_BADMAGIC == leb_status || + EBHDR_LEB_BADCRC == leb_status) { + err = scan_add_to_queue(si, pebnr, erase_cnt, &si->corrupted); + return err; + } + else if (EBHDR_LEB_FREE == leb_status) { + err = scan_add_to_queue(si, pebnr, erase_cnt, &si->free); + goto count_mean; + } + else if (EBHDR_LEB_NO_HDR == leb_status) { + err = scan_add_to_queue(si, pebnr, erase_cnt, &si->erased); + return err; + } + else if (EBHDR_LEB_INVALIDATED == leb_status) { + err = scan_add_to_queue(si, pebnr, erase_cnt, &si->erase); + return err; + } + + err = nor_scan_add_to_used(ebh, si, ebhdr, pebnr, leb_status); + if (err) + return err; + + +count_mean: + si->sum_of_ec += erase_cnt; + si->num_of_eb++; + + return err; +} + +/* + * nand_scan_add_to_used - add a physical eraseblock to the + * used tree of scan info + * @ebh: chfs eraseblock handler + * @si: chfs scanning information + * @ebhdr: eraseblock header + * @pebnr: physical eraseblock number + * @leb_status: the status of the PEB's eraseblock header + * + * This function adds a PEB to the used tree of the scanning information. + * It handles the situations if there are more physical eraseblock referencing + * to the same logical eraseblock. + * Returns zero in case of success, error code in case of fail. 
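+ * On NAND the header cannot be rewritten in place, so the conflict is
+ * resolved with the 64-bit serial number instead: the copy with the
+ * larger serial is the newer one and is kept, the other is queued for
+ * erasing.  (The lid is read through u.nor_hdr below; this presumably
+ * relies on the lid field having the same offset in both union members.)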
+ */ +int +nand_scan_add_to_used(struct chfs_ebh *ebh, struct chfs_scan_info *si, + struct chfs_eb_hdr *ebhdr, int pebnr) +{ + int err, lnr, ec; + struct chfs_scan_leb *sleb, *old; + uint64_t serial = le64toh(ebhdr->u.nand_hdr.serial); + + lnr = CHFS_GET_LID(ebhdr->u.nor_hdr.lid); + ec = le32toh(ebhdr->ec_hdr.erase_cnt); + + sleb = kmem_alloc(sizeof(struct chfs_scan_leb), KM_SLEEP); + + sleb->erase_cnt = ec; + sleb->lnr = lnr; + sleb->pebnr = pebnr; + sleb->info = serial; + + old = RB_INSERT(scan_leb_used_rbtree, &si->used, sleb); + if (old) { + kmem_free(sleb, sizeof(struct chfs_scan_leb)); + /* There is already an eraseblock in the used tree */ + /* If the new one is bad */ + if (serial < old->info) + return scan_add_to_queue(si, pebnr, ec, &si->erase); + else { + err = scan_add_to_queue(si, + old->pebnr, old->erase_cnt, &si->erase); + if (err) + return err; + + old->erase_cnt = ec; + old->lnr = lnr; + old->pebnr = pebnr; + old->info = serial; + return 0; + } + } + return 0; +} + +/** + * nand_process eb -read the headers from NAND flash, check them and add to the + * scanning information + * @ebh: chfs eraseblock handler + * @si: chfs scanning information + * @pebnr: physical eraseblock number + * + * Returns zero in case of success, error code in case of fail. + */ +int +nand_process_eb(struct chfs_ebh *ebh, struct chfs_scan_info *si, + int pebnr, struct chfs_eb_hdr *ebhdr) +{ + int err, erase_cnt, leb_status; + uint64_t max_serial; + /* isbad() is defined on some ancient platforms, heh */ + bool is_bad; + + /* Check block is bad */ + err = flash_block_isbad(ebh->flash_dev, + pebnr * ebh->flash_if->erasesize, &is_bad); + if (err) { + chfs_err("checking block is bad failed\n"); + return err; + } + if (is_bad) { + si->bad_peb_cnt++; + return 0; + } + + err = ebh->ops->read_eb_hdr(ebh, pebnr, ebhdr); + if (err) + return err; + + erase_cnt = le32toh(ebhdr->ec_hdr.erase_cnt); + leb_status = ebh->ops->check_eb_hdr(ebh, ebhdr); + if (EBHDR_LEB_BADMAGIC == leb_status || + EBHDR_LEB_BADCRC == leb_status) { + err = scan_add_to_queue(si, pebnr, erase_cnt, &si->corrupted); + return err; + } + else if (EBHDR_LEB_FREE == leb_status) { + err = scan_add_to_queue(si, pebnr, erase_cnt, &si->free); + goto count_mean; + } + else if (EBHDR_LEB_NO_HDR == leb_status) { + err = scan_add_to_queue(si, pebnr, erase_cnt, &si->erased); + return err; + } + + err = nand_scan_add_to_used(ebh, si, ebhdr, pebnr); + if (err) + return err; + + max_serial = le64toh(ebhdr->u.nand_hdr.serial); + if (max_serial > *ebh->max_serial) { + *ebh->max_serial = max_serial; + } + +count_mean: + si->sum_of_ec += erase_cnt; + si->num_of_eb++; + + return err; +} + +/** + * chfs_scan - scans the media and returns informations about it + * @ebh: chfs eraseblock handler + * + * This function scans through the media and returns information about it or if + * it fails NULL will be returned. 
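+ * The result sorts the PEBs into the 'used' tree (valid LEB header) and
+ * the free, erased, erase and corrupted queues; scan_media() rebuilds the
+ * in-memory eraseblock state from these buckets afterwards.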
+ */ +struct chfs_scan_info * +chfs_scan(struct chfs_ebh *ebh) +{ + struct chfs_scan_info *si; + struct chfs_eb_hdr *ebhdr; + int pebnr, err; + + si = kmem_alloc(sizeof(*si), KM_SLEEP); + + TAILQ_INIT(&si->corrupted); + TAILQ_INIT(&si->free); + TAILQ_INIT(&si->erase); + TAILQ_INIT(&si->erased); + RB_INIT(&si->used); + si->bad_peb_cnt = 0; + si->num_of_eb = 0; + si->sum_of_ec = 0; + + ebhdr = kmem_alloc(sizeof(*ebhdr), KM_SLEEP); + + for (pebnr = 0; pebnr < ebh->peb_nr; pebnr++) { + dbg_ebh("processing PEB %d\n", pebnr); + err = ebh->ops->process_eb(ebh, si, pebnr, ebhdr); + if (err < 0) + goto out_ebhdr; + } + kmem_free(ebhdr, sizeof(*ebhdr)); + dbg_ebh("[CHFS_SCAN] scanning information collected\n"); + return si; + +out_ebhdr: + kmem_free(ebhdr, sizeof(*ebhdr)); + kmem_free(si, sizeof(*si)); + return NULL; +} + +/** + * scan_info_destroy - frees all lists and trees in the scanning information + * @si: the scanning information + */ +void +scan_info_destroy(struct chfs_scan_info *si) +{ + EBH_QUEUE_DESTROY(&si->corrupted, + struct chfs_scan_leb, u.queue); + + EBH_QUEUE_DESTROY(&si->erase, + struct chfs_scan_leb, u.queue); + + EBH_QUEUE_DESTROY(&si->erased, + struct chfs_scan_leb, u.queue); + + EBH_QUEUE_DESTROY(&si->free, + struct chfs_scan_leb, u.queue); + + EBH_TREE_DESTROY(scan_leb_used_rbtree, + &si->used, struct chfs_scan_leb); + + kmem_free(si, sizeof(*si)); + dbg_ebh("[SCAN_INFO_DESTROY] scanning information destroyed\n"); +} + +/** + * scan_media - scan media + * + * @ebh - chfs eraseblock handler + * + * Returns zero in case of success, error code in case of fail. + */ + +int +scan_media(struct chfs_ebh *ebh) +{ + int err, i, avg_ec; + struct chfs_scan_info *si; + struct chfs_scan_leb *sleb; + + si = chfs_scan(ebh); + /* + * Process the scan info, manage the eraseblock lists + */ + mutex_init(&ebh->ltree_lock, MUTEX_DEFAULT, IPL_NONE); + mutex_init(&ebh->erase_lock, MUTEX_DEFAULT, IPL_NONE); + RB_INIT(&ebh->ltree); + RB_INIT(&ebh->free); + RB_INIT(&ebh->in_use); + TAILQ_INIT(&ebh->to_erase); + TAILQ_INIT(&ebh->fully_erased); + mutex_init(&ebh->alc_mutex, MUTEX_DEFAULT, IPL_NONE); + + ebh->peb_nr -= si->bad_peb_cnt; + + /* + * Create background thread for erasing + */ + erase_thread_start(ebh); + + ebh->lmap = kmem_alloc(ebh->peb_nr * sizeof(int), KM_SLEEP); + + for (i = 0; i < ebh->peb_nr; i++) { + ebh->lmap[i] = EBH_LEB_UNMAPPED; + } + + if (si->num_of_eb == 0) { + /* The flash contains no data. 
*/ + avg_ec = 0; + } + else { + avg_ec = (int) (si->sum_of_ec / si->num_of_eb); + } + dbg_ebh("num_of_eb: %d\n", si->num_of_eb); + + mutex_enter(&ebh->erase_lock); + + RB_FOREACH(sleb, scan_leb_used_rbtree, &si->used) { + ebh->lmap[sleb->lnr] = sleb->pebnr; + err = add_peb_to_in_use(ebh, sleb->pebnr, sleb->erase_cnt); + if (err) + goto out_free; + } + + TAILQ_FOREACH(sleb, &si->erased, u.queue) { + err = add_peb_to_erase_queue(ebh, sleb->pebnr, avg_ec, + &ebh->fully_erased); + if (err) + goto out_free; + } + + TAILQ_FOREACH(sleb, &si->erase, u.queue) { + err = add_peb_to_erase_queue(ebh, sleb->pebnr, avg_ec, + &ebh->to_erase); + if (err) + goto out_free; + } + + TAILQ_FOREACH(sleb, &si->free, u.queue) { + err = add_peb_to_free(ebh, sleb->pebnr, sleb->erase_cnt); + if (err) + goto out_free; + } + + TAILQ_FOREACH(sleb, &si->corrupted, u.queue) { + err = add_peb_to_erase_queue(ebh, sleb->pebnr, avg_ec, + &ebh->to_erase); + if (err) + goto out_free; + } + mutex_exit(&ebh->erase_lock); + scan_info_destroy(si); + return 0; + +out_free: + mutex_exit(&ebh->erase_lock); + kmem_free(ebh->lmap, ebh->peb_nr * sizeof(int)); + scan_info_destroy(si); + dbg_ebh("[SCAN_MEDIA] returning with error: %d\n", err); + return err; +} + +/*****************************************************************************/ +/* End of Scan related operations */ +/*****************************************************************************/ + +/** + * ebh_open - opens mtd device and init ereaseblock header + * @ebh: eraseblock handler + * @flash_nr: flash device number to use + * + * Returns zero in case of success, error code in case of fail. + */ +int +ebh_open(struct chfs_ebh *ebh, dev_t dev) +{ + int err; + + ebh->flash_dev = flash_get_device(dev); + if (!ebh->flash_dev) { + aprint_error("ebh_open: cant get flash device\n"); + return ENODEV; + } + + ebh->flash_if = flash_get_interface(dev); + if (!ebh->flash_if) { + aprint_error("ebh_open: cant get flash interface\n"); + return ENODEV; + } + + ebh->flash_size = flash_get_size(dev); + ebh->peb_nr = ebh->flash_size / ebh->flash_if->erasesize; +// ebh->peb_nr = ebh->flash_if->size / ebh->flash_if->erasesize; + /* Set up flash operations based on flash type */ + ebh->ops = kmem_alloc(sizeof(struct chfs_ebh_ops), KM_SLEEP); + + switch (ebh->flash_if->type) { + case FLASH_TYPE_NOR: + ebh->eb_size = ebh->flash_if->erasesize - + CHFS_EB_EC_HDR_SIZE - CHFS_EB_HDR_NOR_SIZE; + + ebh->ops->read_eb_hdr = nor_read_eb_hdr; + ebh->ops->write_eb_hdr = nor_write_eb_hdr; + ebh->ops->check_eb_hdr = nor_check_eb_hdr; + ebh->ops->mark_eb_hdr_dirty_flash = + nor_mark_eb_hdr_dirty_flash; + ebh->ops->invalidate_eb_hdr = nor_invalidate_eb_hdr; + ebh->ops->mark_eb_hdr_free = mark_eb_hdr_free; + + ebh->ops->process_eb = nor_process_eb; + + ebh->ops->create_eb_hdr = nor_create_eb_hdr; + ebh->ops->calc_data_offs = nor_calc_data_offs; + + ebh->max_serial = NULL; + break; + case FLASH_TYPE_NAND: + ebh->eb_size = ebh->flash_if->erasesize - + 2 * ebh->flash_if->page_size; + + ebh->ops->read_eb_hdr = nand_read_eb_hdr; + ebh->ops->write_eb_hdr = nand_write_eb_hdr; + ebh->ops->check_eb_hdr = nand_check_eb_hdr; + ebh->ops->mark_eb_hdr_free = mark_eb_hdr_free; + ebh->ops->mark_eb_hdr_dirty_flash = NULL; + ebh->ops->invalidate_eb_hdr = NULL; + + ebh->ops->process_eb = nand_process_eb; + + ebh->ops->create_eb_hdr = nand_create_eb_hdr; + ebh->ops->calc_data_offs = nand_calc_data_offs; + + ebh->max_serial = kmem_alloc(sizeof(uint64_t), KM_SLEEP); + + *ebh->max_serial = 0; + break; + default: + return 1; + } + 
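+	/*
+	 * eb_size is the space usable for data in each logical eraseblock:
+	 * the physical erase size minus the header overhead (EC header plus
+	 * NOR header on NOR flash, the first two flash pages on NAND).
+	 */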
printf("opening ebh: eb_size: %zu\n", ebh->eb_size); + err = scan_media(ebh); + if (err) { + dbg_ebh("Scan failed."); + kmem_free(ebh->ops, sizeof(struct chfs_ebh_ops)); + kmem_free(ebh, sizeof(struct chfs_ebh)); + return err; + } + return 0; +} + +/** + * ebh_close - close ebh + * @ebh: eraseblock handler + * Returns zero in case of success, error code in case of fail. + */ +int +ebh_close(struct chfs_ebh *ebh) +{ + erase_thread_stop(ebh); + + EBH_TREE_DESTROY(peb_free_rbtree, &ebh->free, struct chfs_peb); + EBH_TREE_DESTROY(peb_in_use_rbtree, &ebh->in_use, struct chfs_peb); + + EBH_QUEUE_DESTROY(&ebh->fully_erased, struct chfs_peb, u.queue); + EBH_QUEUE_DESTROY(&ebh->to_erase, struct chfs_peb, u.queue); + + /* XXX HACK, see ebh.h */ + EBH_TREE_DESTROY_MUTEX(ltree_rbtree, &ebh->ltree, + struct chfs_ltree_entry); + + KASSERT(!mutex_owned(&ebh->ltree_lock)); + KASSERT(!mutex_owned(&ebh->alc_mutex)); + KASSERT(!mutex_owned(&ebh->erase_lock)); + + mutex_destroy(&ebh->ltree_lock); + mutex_destroy(&ebh->alc_mutex); + mutex_destroy(&ebh->erase_lock); + + kmem_free(ebh->ops, sizeof(struct chfs_ebh_ops)); + kmem_free(ebh, sizeof(struct chfs_ebh)); + + return 0; +} + +/** + * ebh_read_leb - read data from leb + * @ebh: eraseblock handler + * @lnr: logical eraseblock number + * @buf: buffer to read to + * @offset: offset from where to read + * @len: bytes number to read + * + * Returns zero in case of success, error code in case of fail. + */ +int +ebh_read_leb(struct chfs_ebh *ebh, int lnr, char *buf, uint32_t offset, + size_t len, size_t *retlen) +{ + int err, pebnr; + off_t data_offset; + + KASSERT(offset + len <= ebh->eb_size); + + err = leb_read_lock(ebh, lnr); + if (err) + return err; + pebnr = ebh->lmap[lnr]; + /* If PEB is not mapped the buffer is filled with 0xFF */ + if (EBH_LEB_UNMAPPED == pebnr) { + leb_read_unlock(ebh, lnr); + memset(buf, 0xFF, len); + return 0; + } + + /* Read data */ + data_offset = ebh->ops->calc_data_offs(ebh, pebnr, offset); + err = flash_read(ebh->flash_dev, data_offset, len, retlen, + (unsigned char *) buf); + if (err) + goto out_free; + + KASSERT(len == *retlen); + + leb_read_unlock(ebh, lnr); + return err; + +out_free: + leb_read_unlock(ebh, lnr); + return err; +} + +/** + * get_peb: get a free physical eraseblock + * @ebh - chfs eraseblock handler + * + * This function gets a free eraseblock from the ebh->free RB-tree. + * The fist entry will be returned and deleted from the tree. + * The entries sorted by the erase counters, so the PEB with the smallest + * erase counter will be added back. + * If something goes bad a negative value will be returned. 
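+ * (Note that the failure paths below in fact return positive values --
+ * ENOSPC or the error from free_peb()/add_peb_to_in_use() -- while
+ * callers such as ebh_write_leb() only treat a negative result as an
+ * error.)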
+ */ +int +get_peb(struct chfs_ebh *ebh) +{ + int err, pebnr; + struct chfs_peb *peb; + +retry: + mutex_enter(&ebh->erase_lock); + //dbg_ebh("LOCK: ebh->erase_lock spin locked in get_peb()\n"); + if (RB_EMPTY(&ebh->free)) { + /*There is no more free PEBs in the tree*/ + if (TAILQ_EMPTY(&ebh->to_erase) && + TAILQ_EMPTY(&ebh->fully_erased)) { + mutex_exit(&ebh->erase_lock); + //dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in get_peb()\n"); + return ENOSPC; + } + err = free_peb(ebh); + + mutex_exit(&ebh->erase_lock); + //dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in get_peb()\n"); + + if (err) + return err; + goto retry; + } + peb = RB_MIN(peb_free_rbtree, &ebh->free); + pebnr = peb->pebnr; + RB_REMOVE(peb_free_rbtree, &ebh->free, peb); + err = add_peb_to_in_use(ebh, peb->pebnr, peb->erase_cnt); + if (err) + pebnr = err; + + kmem_free(peb, sizeof(struct chfs_peb)); + + mutex_exit(&ebh->erase_lock); + //dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in get_peb()\n"); + + return pebnr; +} + +/** + * ebh_write_leb - write data to leb + * @ebh: eraseblock handler + * @lnr: logical eraseblock number + * @buf: data to write + * @offset: offset where to write + * @len: bytes number to write + * + * Returns zero in case of success, error code in case of fail. + */ +int +ebh_write_leb(struct chfs_ebh *ebh, int lnr, char *buf, uint32_t offset, + size_t len, size_t *retlen) +{ + int err, pebnr, retries = 0; + off_t data_offset; + struct chfs_eb_hdr *ebhdr; + + dbg("offset: %d | len: %zu | (offset+len): %zu " + " | ebsize: %zu\n", offset, len, (offset+len), ebh->eb_size); + + KASSERT(offset + len <= ebh->eb_size); + + err = leb_write_lock(ebh, lnr); + if (err) + return err; + + pebnr = ebh->lmap[lnr]; + /* If the LEB is mapped write out data */ + if (pebnr != EBH_LEB_UNMAPPED) { + data_offset = ebh->ops->calc_data_offs(ebh, pebnr, offset); + err = flash_write(ebh->flash_dev, data_offset, len, retlen, + (unsigned char *) buf); + + if (err) { + chfs_err("error %d while writing %zu bytes to PEB " + "%d:%ju, written %zu bytes\n", + err, len, pebnr, (uintmax_t )offset, *retlen); + } else { + KASSERT(len == *retlen); + } + + leb_write_unlock(ebh, lnr); + return err; + } + + /* + * If the LEB is unmapped, get a free PEB and write the + * eraseblock header first + */ + ebhdr = kmem_alloc(sizeof(struct chfs_eb_hdr), KM_SLEEP); + + /* Setting up eraseblock header properties */ + ebh->ops->create_eb_hdr(ebhdr, lnr); + +retry: + /* Getting a physical eraseblock from the wear leveling system */ + pebnr = get_peb(ebh); + if (pebnr < 0) { + leb_write_unlock(ebh, lnr); + kmem_free(ebhdr, sizeof(struct chfs_eb_hdr)); + return pebnr; + } + + /* Write the eraseblock header to the media */ + err = ebh->ops->write_eb_hdr(ebh, pebnr, ebhdr); + if (err) { + chfs_warn( + "error writing eraseblock header: LEB %d , PEB %d\n", + lnr, pebnr); + goto write_error; + } + + /* Write out data */ + if (len) { + data_offset = ebh->ops->calc_data_offs(ebh, pebnr, offset); + err = flash_write(ebh->flash_dev, + data_offset, len, retlen, (unsigned char *) buf); + if (err) { + chfs_err("error %d while writing %zu bytes to PEB " + " %d:%ju, written %zu bytes\n", + err, len, pebnr, (uintmax_t )offset, *retlen); + goto write_error; + } + } + + ebh->lmap[lnr] = pebnr; + leb_write_unlock(ebh, lnr); + kmem_free(ebhdr, sizeof(struct chfs_eb_hdr)); + + return 0; + +write_error: err = release_peb(ebh, pebnr); + // max retries (NOW: 2) + if (err || CHFS_MAX_GET_PEB_RETRIES < ++retries) { + leb_write_unlock(ebh, lnr); + kmem_free(ebhdr, 
sizeof(struct chfs_eb_hdr)); + return err; + } + goto retry; +} + +/** + * ebh_erase_leb - erase a leb + * @ebh: eraseblock handler + * @lnr: leb number + * + * Returns zero in case of success, error code in case of fail. + */ +int +ebh_erase_leb(struct chfs_ebh *ebh, int lnr) +{ + int err, pebnr; + + leb_write_lock(ebh, lnr); + + pebnr = ebh->lmap[lnr]; + if (pebnr < 0) { + leb_write_unlock(ebh, lnr); + return EBH_LEB_UNMAPPED; + } + err = release_peb(ebh, pebnr); + if (err) + goto out_unlock; + + ebh->lmap[lnr] = EBH_LEB_UNMAPPED; + cv_signal(&ebh->bg_erase.eth_wakeup); +out_unlock: + leb_write_unlock(ebh, lnr); + return err; +} + +/** + * ebh_map_leb - maps a PEB to LEB + * @ebh: eraseblock handler + * @lnr: leb number + * + * Returns zero on success, error code in case of fail + */ +int +ebh_map_leb(struct chfs_ebh *ebh, int lnr) +{ + int err, pebnr, retries = 0; + struct chfs_eb_hdr *ebhdr; + + ebhdr = kmem_alloc(sizeof(struct chfs_eb_hdr), KM_SLEEP); + + err = leb_write_lock(ebh, lnr); + if (err) + return err; + +retry: + pebnr = get_peb(ebh); + if (pebnr < 0) { + err = pebnr; + goto out_unlock; + } + + ebh->ops->create_eb_hdr(ebhdr, lnr); + + err = ebh->ops->write_eb_hdr(ebh, pebnr, ebhdr); + if (err) { + chfs_warn( + "error writing eraseblock header: LEB %d , PEB %d\n", + lnr, pebnr); + goto write_error; + } + + ebh->lmap[lnr] = pebnr; + +out_unlock: + leb_write_unlock(ebh, lnr); + return err; + +write_error: + err = release_peb(ebh, pebnr); + // max retries (NOW: 2) + if (err || CHFS_MAX_GET_PEB_RETRIES < ++retries) { + leb_write_unlock(ebh, lnr); + kmem_free(ebhdr, sizeof(struct chfs_eb_hdr)); + return err; + } + goto retry; +} + +/** + * ebh_unmap_leb - + * @ebh: eraseblock handler + * @lnr: leb number + * + * Retruns zero on success, error code in case of fail. + */ +int +ebh_unmap_leb(struct chfs_ebh *ebh, int lnr) +{ + int err; + + if (ebh_is_mapped(ebh, lnr) < 0) + /* If the eraseblock already unmapped */ + return 0; + + err = ebh_erase_leb(ebh, lnr); + + return err; +} + +/** + * ebh_is_mapped - check if a PEB is mapped to @lnr + * @ebh: eraseblock handler + * @lnr: leb number + * + * Retruns 0 if the logical eraseblock is mapped, negative error code otherwise. + */ +int +ebh_is_mapped(struct chfs_ebh *ebh, int lnr) +{ + int err, result; + err = leb_read_lock(ebh, lnr); + if (err) + return err; + + result = ebh->lmap[lnr]; + leb_read_unlock(ebh, lnr); + + return result; +} + +/** + * ebh_change_leb - write the LEB to another PEB + * @ebh: eraseblock handler + * @lnr: leb number + * @buf: data to write + * @len: length of data + * Returns zero in case of success, error code in case of fail. 
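+ *
+ * The change is copy-on-write: the old header is first marked dirty (on
+ * NOR), the new header and data are written to a freshly allocated PEB,
+ * and only then is the old header invalidated and the old PEB released
+ * for erasing.  A crash in between therefore leaves at most two copies of
+ * the LEB on the media, which the scan code arbitrates between.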
+ */ +int +ebh_change_leb(struct chfs_ebh *ebh, int lnr, char *buf, size_t len, + size_t *retlen) +{ + int err, pebnr, pebnr_old, retries = 0; + off_t data_offset; + + struct chfs_peb *peb = NULL; + struct chfs_eb_hdr *ebhdr; + + if (ebh_is_mapped(ebh, lnr) < 0) + return EBH_LEB_UNMAPPED; + + if (len == 0) { + err = ebh_unmap_leb(ebh, lnr); + if (err) + return err; + return ebh_map_leb(ebh, lnr); + } + + ebhdr = kmem_alloc(sizeof(struct chfs_eb_hdr), KM_SLEEP); + + pebnr_old = ebh->lmap[lnr]; + + mutex_enter(&ebh->alc_mutex); + err = leb_write_lock(ebh, lnr); + if (err) + goto out_mutex; + + if (ebh->ops->mark_eb_hdr_dirty_flash) { + err = ebh->ops->mark_eb_hdr_dirty_flash(ebh, pebnr_old, lnr); + if (err) + goto out_unlock; + } + + /* Setting up eraseblock header properties */ + ebh->ops->create_eb_hdr(ebhdr, lnr); + +retry: + /* Getting a physical eraseblock from the wear leveling system */ + pebnr = get_peb(ebh); + if (pebnr < 0) { + leb_write_unlock(ebh, lnr); + mutex_exit(&ebh->alc_mutex); + kmem_free(ebhdr, sizeof(struct chfs_eb_hdr)); + return pebnr; + } + + err = ebh->ops->write_eb_hdr(ebh, pebnr, ebhdr); + if (err) { + chfs_warn( + "error writing eraseblock header: LEB %d , PEB %d", + lnr, pebnr); + goto write_error; + } + + /* Write out data */ + data_offset = ebh->ops->calc_data_offs(ebh, pebnr, 0); + err = flash_write(ebh->flash_dev, data_offset, len, retlen, + (unsigned char *) buf); + if (err) { + chfs_err("error %d while writing %zu bytes to PEB %d:%ju," + " written %zu bytes", + err, len, pebnr, (uintmax_t)data_offset, *retlen); + goto write_error; + } + + ebh->lmap[lnr] = pebnr; + + if (ebh->ops->invalidate_eb_hdr) { + err = ebh->ops->invalidate_eb_hdr(ebh, pebnr_old); + if (err) + goto out_unlock; + } + peb = find_peb_in_use(ebh, pebnr_old); + err = release_peb(ebh, peb->pebnr); + +out_unlock: + leb_write_unlock(ebh, lnr); + +out_mutex: + mutex_exit(&ebh->alc_mutex); + kmem_free(ebhdr, sizeof(struct chfs_eb_hdr)); + kmem_free(peb, sizeof(struct chfs_peb)); + return err; + +write_error: + err = release_peb(ebh, pebnr); + //max retries (NOW: 2) + if (err || CHFS_MAX_GET_PEB_RETRIES < ++retries) { + leb_write_unlock(ebh, lnr); + mutex_exit(&ebh->alc_mutex); + kmem_free(ebhdr, sizeof(struct chfs_eb_hdr)); + return err; + } + goto retry; +} + diff --git a/include/ufs/chfs/ebh.h b/sys/ufs/chfs/ebh.h similarity index 100% rename from include/ufs/chfs/ebh.h rename to sys/ufs/chfs/ebh.h diff --git a/include/ufs/chfs/ebh_media.h b/sys/ufs/chfs/ebh_media.h similarity index 100% rename from include/ufs/chfs/ebh_media.h rename to sys/ufs/chfs/ebh_media.h diff --git a/include/ufs/chfs/ebh_misc.h b/sys/ufs/chfs/ebh_misc.h similarity index 100% rename from include/ufs/chfs/ebh_misc.h rename to sys/ufs/chfs/ebh_misc.h diff --git a/include/ufs/chfs/media.h b/sys/ufs/chfs/media.h similarity index 100% rename from include/ufs/chfs/media.h rename to sys/ufs/chfs/media.h diff --git a/sys/ufs/ext2fs/Makefile b/sys/ufs/ext2fs/Makefile new file mode 100644 index 000000000..a3df42f59 --- /dev/null +++ b/sys/ufs/ext2fs/Makefile @@ -0,0 +1,7 @@ +# $NetBSD: Makefile,v 1.1 1998/06/12 23:23:11 cgd Exp $ + +INCSDIR= /usr/include/ufs/ext2fs + +INCS= ext2fs.h ext2fs_dinode.h ext2fs_dir.h ext2fs_extern.h + +.include diff --git a/include/ufs/ext2fs/ext2fs.h b/sys/ufs/ext2fs/ext2fs.h similarity index 100% rename from include/ufs/ext2fs/ext2fs.h rename to sys/ufs/ext2fs/ext2fs.h diff --git a/sys/ufs/ext2fs/ext2fs_alloc.c b/sys/ufs/ext2fs/ext2fs_alloc.c new file mode 100644 index 000000000..9c2b4cf40 --- 
/dev/null +++ b/sys/ufs/ext2fs/ext2fs_alloc.c @@ -0,0 +1,637 @@ +/* $NetBSD: ext2fs_alloc.c,v 1.42 2011/03/06 04:46:26 rmind Exp $ */ + +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_alloc.c 8.11 (Berkeley) 10/27/94 + * Modified for ext2fs by Manuel Bouyer. + */ + +/* + * Copyright (c) 1997 Manuel Bouyer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @(#)ffs_alloc.c 8.11 (Berkeley) 10/27/94 + * Modified for ext2fs by Manuel Bouyer. 
+ */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ext2fs_alloc.c,v 1.42 2011/03/06 04:46:26 rmind Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +u_long ext2gennumber; + +static daddr_t ext2fs_alloccg(struct inode *, int, daddr_t, int); +static u_long ext2fs_dirpref(struct m_ext2fs *); +static void ext2fs_fserr(struct m_ext2fs *, u_int, const char *); +static u_long ext2fs_hashalloc(struct inode *, int, long, int, + daddr_t (*)(struct inode *, int, daddr_t, int)); +static daddr_t ext2fs_nodealloccg(struct inode *, int, daddr_t, int); +static daddr_t ext2fs_mapsearch(struct m_ext2fs *, char *, daddr_t); + +/* + * Allocate a block in the file system. + * + * A preference may be optionally specified. If a preference is given + * the following hierarchy is used to allocate a block: + * 1) allocate the requested block. + * 2) allocate a rotationally optimal block in the same cylinder. + * 3) allocate a block in the same cylinder group. + * 4) quadradically rehash into other cylinder groups, until an + * available block is located. + * If no block preference is given the following hierarchy is used + * to allocate a block: + * 1) allocate a block in the cylinder group that contains the + * inode for the file. + * 2) quadradically rehash into other cylinder groups, until an + * available block is located. + */ +int +ext2fs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, + kauth_cred_t cred, daddr_t *bnp) +{ + struct m_ext2fs *fs; + daddr_t bno; + int cg; + + *bnp = 0; + fs = ip->i_e2fs; +#ifdef DIAGNOSTIC + if (cred == NOCRED) + panic("ext2fs_alloc: missing credential"); +#endif /* DIAGNOSTIC */ + if (fs->e2fs.e2fs_fbcount == 0) + goto nospace; + if (kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL, + NULL, NULL) != 0 && + freespace(fs) <= 0) + goto nospace; + if (bpref >= fs->e2fs.e2fs_bcount) + bpref = 0; + if (bpref == 0) + cg = ino_to_cg(fs, ip->i_number); + else + cg = dtog(fs, bpref); + bno = (daddr_t)ext2fs_hashalloc(ip, cg, bpref, fs->e2fs_bsize, + ext2fs_alloccg); + if (bno > 0) { + ip->i_e2fs_nblock += btodb(fs->e2fs_bsize); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + *bnp = bno; + return (0); + } +nospace: + ext2fs_fserr(fs, kauth_cred_geteuid(cred), "file system full"); + uprintf("\n%s: write failed, file system is full\n", fs->e2fs_fsmnt); + return (ENOSPC); +} + +/* + * Allocate an inode in the file system. + * + * If allocating a directory, use ext2fs_dirpref to select the inode. + * If allocating in a directory, the following hierarchy is followed: + * 1) allocate the preferred inode. + * 2) allocate an inode in the same cylinder group. + * 3) quadradically rehash into other cylinder groups, until an + * available inode is located. + * If no inode preference is given the following hierarchy is used + * to allocate an inode: + * 1) allocate an inode in cylinder group 0. + * 2) quadradically rehash into other cylinder groups, until an + * available inode is located. 
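Both allocators here start from a cylinder group chosen the same way: with no preference, the group holding the file's (or parent's) inode; with a block preference, the group holding that block. A small sketch of that starting-point choice using made-up geometry, where IPG and BPG stand in for the superblock's e2fs_ipg/e2fs_bpg values:

#include <stdio.h>

/* Hypothetical geometry: 1024 inodes and 8192 blocks per cylinder group. */
#define IPG           1024
#define BPG           8192
#define FIRST_DBLOCK  1

/* ino_to_cg(): which group holds this inode (inode numbers are 1-based). */
static int ino_to_cg(unsigned ino) { return (ino - 1) / IPG; }

/* dtog(): which group holds this filesystem block. */
static int dtog(unsigned blk) { return (blk - FIRST_DBLOCK) / BPG; }

/* Starting group for a block allocation, as chosen in ext2fs_alloc(). */
static int start_cg(unsigned ino, unsigned bpref)
{
    return bpref == 0 ? ino_to_cg(ino) : dtog(bpref);
}

int main(void)
{
    printf("no preference, inode 3000 -> cg %d\n", start_cg(3000, 0));
    printf("preferred block 20000     -> cg %d\n", start_cg(3000, 20000));
    return 0;
}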
+ */ +int +ext2fs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred, + struct vnode **vpp) +{ + struct inode *pip; + struct m_ext2fs *fs; + struct inode *ip; + ino_t ino, ipref; + int cg, error; + + *vpp = NULL; + pip = VTOI(pvp); + fs = pip->i_e2fs; + if (fs->e2fs.e2fs_ficount == 0) + goto noinodes; + + if ((mode & IFMT) == IFDIR) + cg = ext2fs_dirpref(fs); + else + cg = ino_to_cg(fs, pip->i_number); + ipref = cg * fs->e2fs.e2fs_ipg + 1; + ino = (ino_t)ext2fs_hashalloc(pip, cg, (long)ipref, mode, ext2fs_nodealloccg); + if (ino == 0) + goto noinodes; + error = VFS_VGET(pvp->v_mount, ino, vpp); + if (error) { + ext2fs_vfree(pvp, ino, mode); + return (error); + } + ip = VTOI(*vpp); + if (ip->i_e2fs_mode && ip->i_e2fs_nlink != 0) { + printf("mode = 0%o, nlinks %d, inum = %llu, fs = %s\n", + ip->i_e2fs_mode, ip->i_e2fs_nlink, + (unsigned long long)ip->i_number, fs->e2fs_fsmnt); + panic("ext2fs_valloc: dup alloc"); + } + + memset(ip->i_din.e2fs_din, 0, sizeof(struct ext2fs_dinode)); + + /* + * Set up a new generation number for this inode. + */ + if (++ext2gennumber < time_second) + ext2gennumber = time_second; + ip->i_e2fs_gen = ext2gennumber; + return (0); +noinodes: + ext2fs_fserr(fs, kauth_cred_geteuid(cred), "out of inodes"); + uprintf("\n%s: create/symlink failed, no inodes free\n", fs->e2fs_fsmnt); + return (ENOSPC); +} + +/* + * Find a cylinder to place a directory. + * + * The policy implemented by this algorithm is to select from + * among those cylinder groups with above the average number of + * free inodes, the one with the smallest number of directories. + */ +static u_long +ext2fs_dirpref(struct m_ext2fs *fs) +{ + int cg, maxspace, mincg, avgifree; + + avgifree = fs->e2fs.e2fs_ficount / fs->e2fs_ncg; + maxspace = 0; + mincg = -1; + for (cg = 0; cg < fs->e2fs_ncg; cg++) + if ( fs->e2fs_gd[cg].ext2bgd_nifree >= avgifree) { + if (mincg == -1 || fs->e2fs_gd[cg].ext2bgd_nbfree > maxspace) { + mincg = cg; + maxspace = fs->e2fs_gd[cg].ext2bgd_nbfree; + } + } + return mincg; +} + +/* + * Select the desired position for the next block in a file. The file is + * logically divided into sections. The first section is composed of the + * direct blocks. Each additional section contains fs_maxbpg blocks. + * + * If no blocks have been allocated in the first section, the policy is to + * request a block in the same cylinder group as the inode that describes + * the file. Otherwise, the policy is to try to allocate the blocks + * contigously. The two fields of the ext2 inode extension (see + * ufs/ufs/inode.h) help this. + */ +daddr_t +ext2fs_blkpref(struct inode *ip, daddr_t lbn, int indx, + int32_t *bap /* XXX ondisk32 */) +{ + struct m_ext2fs *fs; + int cg, i; + + fs = ip->i_e2fs; + /* + * if we are doing contigous lbn allocation, try to alloc blocks + * contigously on disk + */ + + if ( ip->i_e2fs_last_blk && lbn == ip->i_e2fs_last_lblk + 1) { + return ip->i_e2fs_last_blk + 1; + } + + /* + * bap, if provided, gives us a list of blocks to which we want to + * stay close + */ + + if (bap) { + for (i = indx; i >= 0 ; i--) { + if (bap[i]) { + return fs2h32(bap[i]) + 1; + } + } + } + + /* fall back to the first block of the cylinder containing the inode */ + + cg = ino_to_cg(fs, ip->i_number); + return fs->e2fs.e2fs_bpg * cg + fs->e2fs.e2fs_first_dblock + 1; +} + +/* + * Implement the cylinder overflow algorithm. + * + * The policy implemented by this algorithm is: + * 1) allocate the block in its requested cylinder group. + * 2) quadradically rehash on the cylinder group number. 
+ * 3) brute force search for a free block. + */ +static u_long +ext2fs_hashalloc(struct inode *ip, int cg, long pref, int size, + daddr_t (*allocator)(struct inode *, int, daddr_t, int)) +{ + struct m_ext2fs *fs; + long result; + int i, icg = cg; + + fs = ip->i_e2fs; + /* + * 1: preferred cylinder group + */ + result = (*allocator)(ip, cg, pref, size); + if (result) + return (result); + /* + * 2: quadratic rehash + */ + for (i = 1; i < fs->e2fs_ncg; i *= 2) { + cg += i; + if (cg >= fs->e2fs_ncg) + cg -= fs->e2fs_ncg; + result = (*allocator)(ip, cg, 0, size); + if (result) + return (result); + } + /* + * 3: brute force search + * Note that we start at i == 2, since 0 was checked initially, + * and 1 is always checked in the quadratic rehash. + */ + cg = (icg + 2) % fs->e2fs_ncg; + for (i = 2; i < fs->e2fs_ncg; i++) { + result = (*allocator)(ip, cg, 0, size); + if (result) + return (result); + cg++; + if (cg == fs->e2fs_ncg) + cg = 0; + } + return (0); +} + +/* + * Determine whether a block can be allocated. + * + * Check to see if a block of the appropriate size is available, + * and if it is, allocate it. + */ + +static daddr_t +ext2fs_alloccg(struct inode *ip, int cg, daddr_t bpref, int size) +{ + struct m_ext2fs *fs; + char *bbp; + struct buf *bp; + /* XXX ondisk32 */ + int error, bno, start, end, loc; + + fs = ip->i_e2fs; + if (fs->e2fs_gd[cg].ext2bgd_nbfree == 0) + return (0); + error = bread(ip->i_devvp, fsbtodb(fs, + fs->e2fs_gd[cg].ext2bgd_b_bitmap), + (int)fs->e2fs_bsize, NOCRED, B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + return (0); + } + bbp = (char *)bp->b_data; + + if (dtog(fs, bpref) != cg) + bpref = 0; + if (bpref != 0) { + bpref = dtogd(fs, bpref); + /* + * if the requested block is available, use it + */ + if (isclr(bbp, bpref)) { + bno = bpref; + goto gotit; + } + } + /* + * no blocks in the requested cylinder, so take next + * available one in this cylinder group. + * first try to get 8 contigous blocks, then fall back to a single + * block. + */ + if (bpref) + start = dtogd(fs, bpref) / NBBY; + else + start = 0; + end = howmany(fs->e2fs.e2fs_fpg, NBBY) - start; + for (loc = start; loc < end; loc++) { + if (bbp[loc] == 0) { + bno = loc * NBBY; + goto gotit; + } + } + for (loc = 0; loc < start; loc++) { + if (bbp[loc] == 0) { + bno = loc * NBBY; + goto gotit; + } + } + + bno = ext2fs_mapsearch(fs, bbp, bpref); + if (bno < 0) + return (0); +gotit: +#ifdef DIAGNOSTIC + if (isset(bbp, (daddr_t)bno)) { + printf("ext2fs_alloccgblk: cg=%d bno=%d fs=%s\n", + cg, bno, fs->e2fs_fsmnt); + panic("ext2fs_alloccg: dup alloc"); + } +#endif + setbit(bbp, (daddr_t)bno); + fs->e2fs.e2fs_fbcount--; + fs->e2fs_gd[cg].ext2bgd_nbfree--; + fs->e2fs_fmod = 1; + bdwrite(bp); + return (cg * fs->e2fs.e2fs_fpg + fs->e2fs.e2fs_first_dblock + bno); +} + +/* + * Determine whether an inode can be allocated. + * + * Check to see if an inode is available, and if it is, + * allocate it using the following policy: + * 1) allocate the requested inode. + * 2) allocate the next available inode after the requested + * inode in the specified cylinder group. 
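ext2fs_hashalloc() above is the cylinder-overflow search: try the preferred group, quadratically rehash (cg+1, cg+2, cg+4, ...), then sweep the remaining groups starting at icg+2. A self-contained model of just that search order, where try_alloc() is a toy predicate standing in for the per-group allocator callback:

#include <stdio.h>

#define NCG 8

/* 1 = this group can satisfy the request, 0 = it cannot (toy allocator). */
static int try_alloc(int cg) { return cg == 6; }

/* Cylinder-overflow search: preferred group, quadratic rehash, brute force. */
static int hashalloc(int cg)
{
    int i, icg = cg;

    if (try_alloc(cg))                       /* 1: preferred group */
        return cg;
    for (i = 1; i < NCG; i *= 2) {           /* 2: quadratic rehash */
        cg += i;
        if (cg >= NCG)
            cg -= NCG;
        if (try_alloc(cg))
            return cg;
    }
    cg = (icg + 2) % NCG;                    /* 3: brute-force sweep */
    for (i = 2; i < NCG; i++) {
        if (try_alloc(cg))
            return cg;
        if (++cg == NCG)
            cg = 0;
    }
    return -1;                               /* nothing free anywhere */
}

int main(void)
{
    printf("allocation landed in cg %d\n", hashalloc(1));
    return 0;
}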
+ */ +static daddr_t +ext2fs_nodealloccg(struct inode *ip, int cg, daddr_t ipref, int mode) +{ + struct m_ext2fs *fs; + char *ibp; + struct buf *bp; + int error, start, len, loc, map, i; + + ipref--; /* to avoid a lot of (ipref -1) */ + if (ipref == -1) + ipref = 0; + fs = ip->i_e2fs; + if (fs->e2fs_gd[cg].ext2bgd_nifree == 0) + return (0); + error = bread(ip->i_devvp, fsbtodb(fs, + fs->e2fs_gd[cg].ext2bgd_i_bitmap), + (int)fs->e2fs_bsize, NOCRED, B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + return (0); + } + ibp = (char *)bp->b_data; + if (ipref) { + ipref %= fs->e2fs.e2fs_ipg; + if (isclr(ibp, ipref)) + goto gotit; + } + start = ipref / NBBY; + len = howmany(fs->e2fs.e2fs_ipg - ipref, NBBY); + loc = skpc(0xff, len, &ibp[start]); + if (loc == 0) { + len = start + 1; + start = 0; + loc = skpc(0xff, len, &ibp[0]); + if (loc == 0) { + printf("cg = %d, ipref = %lld, fs = %s\n", + cg, (long long)ipref, fs->e2fs_fsmnt); + panic("ext2fs_nodealloccg: map corrupted"); + /* NOTREACHED */ + } + } + i = start + len - loc; + map = ibp[i] ^ 0xff; + if (map == 0) { + printf("fs = %s\n", fs->e2fs_fsmnt); + panic("ext2fs_nodealloccg: block not in map"); + } + ipref = i * NBBY + ffs(map) - 1; +gotit: + setbit(ibp, ipref); + fs->e2fs.e2fs_ficount--; + fs->e2fs_gd[cg].ext2bgd_nifree--; + fs->e2fs_fmod = 1; + if ((mode & IFMT) == IFDIR) { + fs->e2fs_gd[cg].ext2bgd_ndirs++; + } + bdwrite(bp); + return (cg * fs->e2fs.e2fs_ipg + ipref +1); +} + +/* + * Free a block. + * + * The specified block is placed back in the + * free map. + */ +void +ext2fs_blkfree(struct inode *ip, daddr_t bno) +{ + struct m_ext2fs *fs; + char *bbp; + struct buf *bp; + int error, cg; + + fs = ip->i_e2fs; + cg = dtog(fs, bno); + if ((u_int)bno >= fs->e2fs.e2fs_bcount) { + printf("bad block %lld, ino %llu\n", (long long)bno, + (unsigned long long)ip->i_number); + ext2fs_fserr(fs, ip->i_uid, "bad block"); + return; + } + error = bread(ip->i_devvp, + fsbtodb(fs, fs->e2fs_gd[cg].ext2bgd_b_bitmap), + (int)fs->e2fs_bsize, NOCRED, B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + return; + } + bbp = (char *)bp->b_data; + bno = dtogd(fs, bno); + if (isclr(bbp, bno)) { + printf("dev = 0x%llx, block = %lld, fs = %s\n", + (unsigned long long)ip->i_dev, (long long)bno, + fs->e2fs_fsmnt); + panic("blkfree: freeing free block"); + } + clrbit(bbp, bno); + fs->e2fs.e2fs_fbcount++; + fs->e2fs_gd[cg].ext2bgd_nbfree++; + + fs->e2fs_fmod = 1; + bdwrite(bp); +} + +/* + * Free an inode. + * + * The specified inode is placed back in the free map. 
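Allocation and freeing within a group both come down to bitmap operations: skip fully used bytes, pick the first clear bit of the first partial byte with ffs(), set it to allocate, clear it to free. A stand-alone sketch of that bitmap handling; setbit/clrbit/isset are re-defined locally (in the kernel they come from <sys/param.h>) and the byte skip is written out by hand where the code above uses skpc():

#include <stdio.h>
#include <strings.h>     /* ffs() */

#define NBBY 8

/* Classic bitmap macros, as used by the allocators above. */
#define setbit(a, i)  ((a)[(i) / NBBY] |=  (1 << ((i) % NBBY)))
#define clrbit(a, i)  ((a)[(i) / NBBY] &= ~(1 << ((i) % NBBY)))
#define isset(a, i)   ((a)[(i) / NBBY] &   (1 << ((i) % NBBY)))

/* First clear bit: skip 0xff bytes, then ffs() on the inverted byte. */
static int first_clear(unsigned char *map, int nbytes)
{
    for (int i = 0; i < nbytes; i++) {
        int inv = map[i] ^ 0xff;
        if (inv != 0)
            return i * NBBY + ffs(inv) - 1;
    }
    return -1;                               /* map is full */
}

int main(void)
{
    unsigned char map[4] = { 0xff, 0xff, 0x3f, 0x00 };  /* bits 0..21 in use */
    int bno = first_clear(map, sizeof(map));

    printf("allocated bit %d\n", bno);       /* prints 22 */
    setbit(map, bno);
    printf("now in use? %s\n", isset(map, bno) ? "yes" : "no");
    clrbit(map, bno);                        /* the free path just clears it */
    printf("after free? %s\n", isset(map, bno) ? "yes" : "no");
    return 0;
}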
+ */ +int +ext2fs_vfree(struct vnode *pvp, ino_t ino, int mode) +{ + struct m_ext2fs *fs; + char *ibp; + struct inode *pip; + struct buf *bp; + int error, cg; + + pip = VTOI(pvp); + fs = pip->i_e2fs; + if ((u_int)ino > fs->e2fs.e2fs_icount || (u_int)ino < EXT2_FIRSTINO) + panic("ifree: range: dev = 0x%llx, ino = %llu, fs = %s", + (unsigned long long)pip->i_dev, (unsigned long long)ino, + fs->e2fs_fsmnt); + cg = ino_to_cg(fs, ino); + error = bread(pip->i_devvp, + fsbtodb(fs, fs->e2fs_gd[cg].ext2bgd_i_bitmap), + (int)fs->e2fs_bsize, NOCRED, B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + return (0); + } + ibp = (char *)bp->b_data; + ino = (ino - 1) % fs->e2fs.e2fs_ipg; + if (isclr(ibp, ino)) { + printf("dev = 0x%llx, ino = %llu, fs = %s\n", + (unsigned long long)pip->i_dev, + (unsigned long long)ino, fs->e2fs_fsmnt); + if (fs->e2fs_ronly == 0) + panic("ifree: freeing free inode"); + } + clrbit(ibp, ino); + fs->e2fs.e2fs_ficount++; + fs->e2fs_gd[cg].ext2bgd_nifree++; + if ((mode & IFMT) == IFDIR) { + fs->e2fs_gd[cg].ext2bgd_ndirs--; + } + fs->e2fs_fmod = 1; + bdwrite(bp); + return (0); +} + +/* + * Find a block in the specified cylinder group. + * + * It is a panic if a request is made to find a block if none are + * available. + */ + +static daddr_t +ext2fs_mapsearch(struct m_ext2fs *fs, char *bbp, daddr_t bpref) +{ + int start, len, loc, i, map; + + /* + * find the fragment by searching through the free block + * map for an appropriate bit pattern + */ + if (bpref) + start = dtogd(fs, bpref) / NBBY; + else + start = 0; + len = howmany(fs->e2fs.e2fs_fpg, NBBY) - start; + loc = skpc(0xff, len, &bbp[start]); + if (loc == 0) { + len = start + 1; + start = 0; + loc = skpc(0xff, len, &bbp[start]); + if (loc == 0) { + printf("start = %d, len = %d, fs = %s\n", + start, len, fs->e2fs_fsmnt); + panic("ext2fs_alloccg: map corrupted"); + /* NOTREACHED */ + } + } + i = start + len - loc; + map = bbp[i] ^ 0xff; + if (map == 0) { + printf("fs = %s\n", fs->e2fs_fsmnt); + panic("ext2fs_mapsearch: block not in map"); + } + return i * NBBY + ffs(map) - 1; +} + +/* + * Fserr prints the name of a file system with an error diagnostic. + * + * The form of the error message is: + * fs: error message + */ +static void +ext2fs_fserr(struct m_ext2fs *fs, u_int uid, const char *cp) +{ + + log(LOG_ERR, "uid %d on %s: %s\n", uid, fs->e2fs_fsmnt, cp); +} diff --git a/sys/ufs/ext2fs/ext2fs_balloc.c b/sys/ufs/ext2fs/ext2fs_balloc.c new file mode 100644 index 000000000..6564bf905 --- /dev/null +++ b/sys/ufs/ext2fs/ext2fs_balloc.c @@ -0,0 +1,403 @@ +/* $NetBSD: ext2fs_balloc.c,v 1.34 2009/10/19 18:41:17 bouyer Exp $ */ + +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_balloc.c 8.4 (Berkeley) 9/23/93 + * Modified for ext2fs by Manuel Bouyer. + */ + +/* + * Copyright (c) 1997 Manuel Bouyer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @(#)ffs_balloc.c 8.4 (Berkeley) 9/23/93 + * Modified for ext2fs by Manuel Bouyer. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ext2fs_balloc.c,v 1.34 2009/10/19 18:41:17 bouyer Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_uvmhist.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include + +/* + * Balloc defines the structure of file system storage + * by allocating the physical blocks on a device given + * the inode and the logical block number in a file. 
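ext2fs_balloc() below first handles the NDADDR direct blocks and then walks one, two, or three levels of indirect blocks, as worked out by ufs_getlbns(). A sketch of that classification with made-up geometry (12 direct pointers, 1024 pointers per indirect block), standing in for the real NDADDR/NINDIR(fs):

#include <stdio.h>

#define NDADDR 12
#define NINDIR 1024

/* How many levels of indirection does logical block bn need? */
static int levels(long long bn)
{
    long long blks = NDADDR, span = 1;

    if (bn < NDADDR)
        return 0;                          /* direct block */
    for (int lvl = 1; lvl <= 3; lvl++) {
        span *= NINDIR;                    /* blocks reachable at this level */
        if (bn < blks + span)
            return lvl;
        blks += span;
    }
    return -1;                             /* beyond triple indirect */
}

int main(void)
{
    long long probes[] = { 0, 11, 12, 1035, 1036, 1049611, 1049612 };

    for (unsigned i = 0; i < sizeof(probes) / sizeof(probes[0]); i++)
        printf("lbn %lld -> %d level(s) of indirection\n",
            probes[i], levels(probes[i]));
    return 0;
}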
+ */ +int +ext2fs_balloc(struct inode *ip, daddr_t bn, int size, + kauth_cred_t cred, struct buf **bpp, int flags) +{ + struct m_ext2fs *fs; + daddr_t nb; + struct buf *bp, *nbp; + struct vnode *vp = ITOV(ip); + struct indir indirs[NIADDR + 2]; + daddr_t newb, lbn, pref; + int32_t *bap; /* XXX ondisk32 */ + int num, i, error; + u_int deallocated; + daddr_t *blkp, *allocblk, allociblk[NIADDR + 1]; + int32_t *allocib; /* XXX ondisk32 */ + int unwindidx = -1; + UVMHIST_FUNC("ext2fs_balloc"); UVMHIST_CALLED(ubchist); + + UVMHIST_LOG(ubchist, "bn 0x%x", bn,0,0,0); + + if (bpp != NULL) { + *bpp = NULL; + } + if (bn < 0) + return (EFBIG); + fs = ip->i_e2fs; + lbn = bn; + + /* + * The first NDADDR blocks are direct blocks + */ + if (bn < NDADDR) { + /* XXX ondisk32 */ + nb = fs2h32(ip->i_e2fs_blocks[bn]); + if (nb != 0) { + + /* + * the block is already allocated, just read it. + */ + + if (bpp != NULL) { + error = bread(vp, bn, fs->e2fs_bsize, NOCRED, + B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + return (error); + } + *bpp = bp; + } + return (0); + } + + /* + * allocate a new direct block. + */ + + error = ext2fs_alloc(ip, bn, + ext2fs_blkpref(ip, bn, bn, &ip->i_e2fs_blocks[0]), + cred, &newb); + if (error) + return (error); + ip->i_e2fs_last_lblk = lbn; + ip->i_e2fs_last_blk = newb; + /* XXX ondisk32 */ + ip->i_e2fs_blocks[bn] = h2fs32((int32_t)newb); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (bpp != NULL) { + bp = getblk(vp, bn, fs->e2fs_bsize, 0, 0); + bp->b_blkno = fsbtodb(fs, newb); + if (flags & B_CLRBUF) + clrbuf(bp); + *bpp = bp; + } + return (0); + } + /* + * Determine the number of levels of indirection. + */ + pref = 0; + if ((error = ufs_getlbns(vp, bn, indirs, &num)) != 0) + return(error); +#ifdef DIAGNOSTIC + if (num < 1) + panic ("ext2fs_balloc: ufs_getlbns returned indirect block\n"); +#endif + /* + * Fetch the first indirect block allocating if necessary. + */ + --num; + /* XXX ondisk32 */ + nb = fs2h32(ip->i_e2fs_blocks[NDADDR + indirs[0].in_off]); + allocib = NULL; + allocblk = allociblk; + if (nb == 0) { + pref = ext2fs_blkpref(ip, lbn, 0, (int32_t *)0); + error = ext2fs_alloc(ip, lbn, pref, cred, &newb); + if (error) + return (error); + nb = newb; + *allocblk++ = nb; + ip->i_e2fs_last_blk = newb; + bp = getblk(vp, indirs[1].in_lbn, fs->e2fs_bsize, 0, 0); + bp->b_blkno = fsbtodb(fs, newb); + clrbuf(bp); + /* + * Write synchronously so that indirect blocks + * never point at garbage. + */ + if ((error = bwrite(bp)) != 0) + goto fail; + unwindidx = 0; + allocib = &ip->i_e2fs_blocks[NDADDR + indirs[0].in_off]; + /* XXX ondisk32 */ + *allocib = h2fs32((int32_t)newb); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } + /* + * Fetch through the indirect blocks, allocating as necessary. + */ + for (i = 1;;) { + error = bread(vp, + indirs[i].in_lbn, (int)fs->e2fs_bsize, NOCRED, 0, &bp); + if (error) { + brelse(bp, 0); + goto fail; + } + bap = (int32_t *)bp->b_data; /* XXX ondisk32 */ + nb = fs2h32(bap[indirs[i].in_off]); + if (i == num) + break; + i++; + if (nb != 0) { + brelse(bp, 0); + continue; + } + pref = ext2fs_blkpref(ip, lbn, 0, (int32_t *)0); + error = ext2fs_alloc(ip, lbn, pref, cred, &newb); + if (error) { + brelse(bp, 0); + goto fail; + } + nb = newb; + *allocblk++ = nb; + ip->i_e2fs_last_blk = newb; + nbp = getblk(vp, indirs[i].in_lbn, fs->e2fs_bsize, 0, 0); + nbp->b_blkno = fsbtodb(fs, nb); + clrbuf(nbp); + /* + * Write synchronously so that indirect blocks + * never point at garbage. 
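The ordering rule stated in the comment above is the crash-safety property of this walk: a newly allocated indirect block is written out before any on-disk parent is made to point at it, so a crash can at worst leak a block, never leave a pointer to uninitialized data. A toy illustration of that write ordering; write_block() merely models a synchronous bwrite() and none of these names are kernel API:

#include <stdio.h>

static int on_disk[16];                      /* toy "stable storage" */

static void write_block(int blkno)
{
    on_disk[blkno] = 1;
    printf("wrote block %d\n", blkno);
}

/* Child contents become durable before the parent is allowed to point at them. */
static void link_child(int child, int *parent_slot)
{
    write_block(child);        /* 1: synchronous write of the new indirect block */
    *parent_slot = child;      /* 2: only then does the parent reference it       */
                               /*    (the parent itself may be written lazily)    */
}

int main(void)
{
    int parent_slot = 0;

    link_child(2, &parent_slot);
    printf("parent slot -> %d, child durable: %s\n",
        parent_slot, on_disk[2] ? "yes" : "no");
    return 0;
}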
+ */ + if ((error = bwrite(nbp)) != 0) { + brelse(bp, 0); + goto fail; + } + if (unwindidx < 0) + unwindidx = i - 1; + /* XXX ondisk32 */ + bap[indirs[i - 1].in_off] = h2fs32((int32_t)nb); + /* + * If required, write synchronously, otherwise use + * delayed write. + */ + if (flags & B_SYNC) { + bwrite(bp); + } else { + bdwrite(bp); + } + } + /* + * Get the data block, allocating if necessary. + */ + if (nb == 0) { + pref = ext2fs_blkpref(ip, lbn, indirs[num].in_off, &bap[0]); + error = ext2fs_alloc(ip, lbn, pref, cred, &newb); + if (error) { + brelse(bp, 0); + goto fail; + } + nb = newb; + *allocblk++ = nb; + ip->i_e2fs_last_lblk = lbn; + ip->i_e2fs_last_blk = newb; + /* XXX ondisk32 */ + bap[indirs[num].in_off] = h2fs32((int32_t)nb); + /* + * If required, write synchronously, otherwise use + * delayed write. + */ + if (flags & B_SYNC) { + bwrite(bp); + } else { + bdwrite(bp); + } + if (bpp != NULL) { + nbp = getblk(vp, lbn, fs->e2fs_bsize, 0, 0); + nbp->b_blkno = fsbtodb(fs, nb); + if (flags & B_CLRBUF) + clrbuf(nbp); + *bpp = nbp; + } + return (0); + } + brelse(bp, 0); + if (bpp != NULL) { + if (flags & B_CLRBUF) { + error = bread(vp, lbn, (int)fs->e2fs_bsize, NOCRED, + B_MODIFY, &nbp); + if (error) { + brelse(nbp, 0); + goto fail; + } + } else { + nbp = getblk(vp, lbn, fs->e2fs_bsize, 0, 0); + nbp->b_blkno = fsbtodb(fs, nb); + } + *bpp = nbp; + } + return (0); +fail: + /* + * If we have failed part way through block allocation, we + * have to deallocate any indirect blocks that we have allocated. + */ + for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) { + ext2fs_blkfree(ip, *blkp); + deallocated += fs->e2fs_bsize; + } + if (unwindidx >= 0) { + if (unwindidx == 0) { + *allocib = 0; + } else { + int r; + + r = bread(vp, indirs[unwindidx].in_lbn, + (int)fs->e2fs_bsize, NOCRED, B_MODIFY, &bp); + if (r) { + panic("Could not unwind indirect block, error %d", r); + brelse(bp, 0); + } else { + bap = (int32_t *)bp->b_data; /* XXX ondisk32 */ + bap[indirs[unwindidx].in_off] = 0; + if (flags & B_SYNC) + bwrite(bp); + else + bdwrite(bp); + } + } + for (i = unwindidx + 1; i <= num; i++) { + bp = getblk(vp, indirs[i].in_lbn, (int)fs->e2fs_bsize, + 0, 0); + brelse(bp, BC_INVAL); + } + } + if (deallocated) { + ip->i_e2fs_nblock -= btodb(deallocated); + ip->i_e2fs_flags |= IN_CHANGE | IN_UPDATE; + } + return error; +} + +int +ext2fs_gop_alloc(struct vnode *vp, off_t off, off_t len, int flags, + kauth_cred_t cred) +{ + struct inode *ip = VTOI(vp); + struct m_ext2fs *fs = ip->i_e2fs; + int error, delta, bshift, bsize; + UVMHIST_FUNC("ext2fs_gop_alloc"); UVMHIST_CALLED(ubchist); + + bshift = fs->e2fs_bshift; + bsize = 1 << bshift; + + delta = off & (bsize - 1); + off -= delta; + len += delta; + + while (len > 0) { + bsize = min(bsize, len); + UVMHIST_LOG(ubchist, "off 0x%x len 0x%x bsize 0x%x", + off, len, bsize, 0); + + error = ext2fs_balloc(ip, lblkno(fs, off), bsize, cred, + NULL, flags); + if (error) { + UVMHIST_LOG(ubchist, "error %d", error, 0,0,0); + return error; + } + + /* + * increase file size now, ext2fs_balloc() requires that + * EOF be up-to-date before each call. + */ + + if (ext2fs_size(ip) < off + bsize) { + UVMHIST_LOG(ubchist, "old 0x%lx%8lx new 0x%lx%8lx", + /* Note that arguments are always cast to u_long. 
*/ + ext2fs_size(ip) >> 32, + ext2fs_size(ip) & 0xffffffff, + (off + bsize) >> 32, + (off + bsize) & 0xffffffff); + error = ext2fs_setsize(ip, off + bsize); + if (error) { + UVMHIST_LOG(ubchist, "error %d", error, 0,0,0); + return error; + } + } + + off += bsize; + len -= bsize; + } + return 0; +} diff --git a/sys/ufs/ext2fs/ext2fs_bmap.c b/sys/ufs/ext2fs/ext2fs_bmap.c new file mode 100644 index 000000000..5336fddc4 --- /dev/null +++ b/sys/ufs/ext2fs/ext2fs_bmap.c @@ -0,0 +1,269 @@ +/* $NetBSD: ext2fs_bmap.c,v 1.25 2009/10/19 18:41:17 bouyer Exp $ */ + +/* + * Copyright (c) 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_bmap.c 8.6 (Berkeley) 1/21/94 + * Modified for ext2fs by Manuel Bouyer. + */ + +/* + * Copyright (c) 1997 Manuel Bouyer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @(#)ufs_bmap.c 8.6 (Berkeley) 1/21/94 + * Modified for ext2fs by Manuel Bouyer. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ext2fs_bmap.c,v 1.25 2009/10/19 18:41:17 bouyer Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +static int ext2fs_bmaparray(struct vnode *, daddr_t, daddr_t *, + struct indir *, int *, int *); + +#define is_sequential(ump, a, b) ((b) == (a) + ump->um_seqinc) + +/* + * Bmap converts a the logical block number of a file to its physical block + * number on the disk. The conversion is done by using the logical block + * number to index into the array of block pointers described by the dinode. + */ +int +ext2fs_bmap(void *v) +{ + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct vnode **a_vpp; + daddr_t *a_bnp; + int *a_runp; + } */ *ap = v; + /* + * Check for underlying vnode requests and ensure that logical + * to physical mapping is requested. + */ + if (ap->a_vpp != NULL) + *ap->a_vpp = VTOI(ap->a_vp)->i_devvp; + if (ap->a_bnp == NULL) + return (0); + + return (ext2fs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, NULL, NULL, + ap->a_runp)); +} + +/* + * Indirect blocks are now on the vnode for the file. They are given negative + * logical block numbers. Indirect blocks are addressed by the negative + * address of the first data block to which they point. Double indirect blocks + * are addressed by one less than the address of the first indirect block to + * which they point. Triple indirect blocks are addressed by one less than + * the address of the first double indirect block to which they point. + * + * ext2fs_bmaparray does the bmap conversion, and if requested returns the + * array of logical blocks which must be traversed to get to a block. + * Each entry contains the offset into that block that gets you to the + * next block and the disk address of the block (if it is assigned). + */ + +int +ext2fs_bmaparray(struct vnode *vp, daddr_t bn, daddr_t *bnp, struct indir *ap, + int *nump, int *runp) +{ + struct inode *ip; + struct buf *bp, *cbp; + struct ufsmount *ump; + struct mount *mp; + struct indir a[NIADDR+1], *xap; + daddr_t daddr; + daddr_t metalbn; + int error, maxrun = 0, num; + + ip = VTOI(vp); + mp = vp->v_mount; + ump = ip->i_ump; +#ifdef DIAGNOSTIC + if ((ap != NULL && nump == NULL) || (ap == NULL && nump != NULL)) + panic("ext2fs_bmaparray: invalid arguments"); +#endif + + if (runp) { + /* + * XXX + * If MAXBSIZE is the largest transfer the disks can handle, + * we probably want maxrun to be 1 block less so that we + * don't create a block larger than the device can handle. 
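The run detection referred to above lets the caller cluster I/O: starting from the block just mapped, count how many following block pointers are physically contiguous, capped so the clustered transfer stays under MAXBSIZE (maxrun = MAXBSIZE / iosize - 1). A user-space sketch of that counting; contiguity is checked in simple block units here, whereas is_sequential() in the code compares device-block addresses via um_seqinc:

#include <stdio.h>

#define MAXBSIZE 65536

/* Count contiguous successors of entry bn, capped by maxrun. */
static int count_run(const int *blks, int n, int bn, int iosize)
{
    int maxrun = MAXBSIZE / iosize - 1;
    int run = 0;

    for (int i = bn + 1;
         i < n && run < maxrun && blks[i] == blks[i - 1] + 1; i++)
        run++;
    return run;
}

int main(void)
{
    int blks[] = { 100, 101, 102, 103, 200, 201 };

    printf("run after entry 0: %d extra block(s)\n", count_run(blks, 6, 0, 4096));
    printf("run after entry 4: %d extra block(s)\n", count_run(blks, 6, 4, 4096));
    return 0;
}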
+ */ + *runp = 0; + maxrun = MAXBSIZE / mp->mnt_stat.f_iosize - 1; + } + + if (bn >= 0 && bn < NDADDR) { + /* XXX ondisk32 */ + *bnp = blkptrtodb(ump, fs2h32(ip->i_e2fs_blocks[bn])); + if (*bnp == 0) + *bnp = -1; + else if (runp) + /* XXX ondisk32 */ + for (++bn; bn < NDADDR && *runp < maxrun && + is_sequential(ump, (daddr_t)fs2h32(ip->i_e2fs_blocks[bn - 1]), + (daddr_t)fs2h32(ip->i_e2fs_blocks[bn])); + ++bn, ++*runp); + return (0); + } + + xap = ap == NULL ? a : ap; + if (!nump) + nump = # + if ((error = ufs_getlbns(vp, bn, xap, nump)) != 0) + return (error); + + num = *nump; + + /* Get disk address out of indirect block array */ + /* XXX ondisk32 */ + daddr = fs2h32(ip->i_e2fs_blocks[NDADDR + xap->in_off]); + +#ifdef DIAGNOSTIC + if (num > NIADDR + 1 || num < 1) { + printf("ext2fs_bmaparray: num=%d\n", num); + panic("ext2fs_bmaparray: num"); + } +#endif + for (bp = NULL, ++xap; --num; ++xap) { + /* + * Exit the loop if there is no disk address assigned yet and + * the indirect block isn't in the cache, or if we were + * looking for an indirect block and we've found it. + */ + + metalbn = xap->in_lbn; + if (metalbn == bn) + break; + if (daddr == 0) { + mutex_enter(&bufcache_lock); + cbp = incore(vp, metalbn); + mutex_exit(&bufcache_lock); + if (cbp == NULL) + break; + } + /* + * If we get here, we've either got the block in the cache + * or we have a disk address for it, go fetch it. + */ + if (bp) + brelse(bp, 0); + + xap->in_exists = 1; + bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0); + if (bp == NULL) { + + /* + * getblk() above returns NULL only iff we are + * pagedaemon. See the implementation of getblk + * for detail. + */ + + return (ENOMEM); + } + if (bp->b_oflags & (BO_DONE | BO_DELWRI)) { + trace(TR_BREADHIT, pack(vp, size), metalbn); + } +#ifdef DIAGNOSTIC + else if (!daddr) + panic("ext2fs_bmaparry: indirect block not in cache"); +#endif + else { + trace(TR_BREADMISS, pack(vp, size), metalbn); + bp->b_blkno = blkptrtodb(ump, daddr); + bp->b_flags |= B_READ; + VOP_STRATEGY(vp, bp); + curlwp->l_ru.ru_inblock++; /* XXX */ + if ((error = biowait(bp)) != 0) { + brelse(bp, 0); + return (error); + } + } + + /* XXX ondisk32 */ + daddr = fs2h32(((int32_t *)bp->b_data)[xap->in_off]); + if (num == 1 && daddr && runp) + /* XXX ondisk32 */ + for (bn = xap->in_off + 1; + bn < MNINDIR(ump) && *runp < maxrun && + is_sequential(ump, ((int32_t *)bp->b_data)[bn - 1], + ((int32_t *)bp->b_data)[bn]); + ++bn, ++*runp); + } + if (bp) + brelse(bp, 0); + + daddr = blkptrtodb(ump, daddr); + *bnp = daddr == 0 ? -1 : daddr; + return (0); +} diff --git a/sys/ufs/ext2fs/ext2fs_bswap.c b/sys/ufs/ext2fs/ext2fs_bswap.c new file mode 100644 index 000000000..ba0ddc462 --- /dev/null +++ b/sys/ufs/ext2fs/ext2fs_bswap.c @@ -0,0 +1,121 @@ +/* $NetBSD: ext2fs_bswap.c,v 1.16 2009/10/19 18:41:17 bouyer Exp $ */ + +/* + * Copyright (c) 1997 Manuel Bouyer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ext2fs_bswap.c,v 1.16 2009/10/19 18:41:17 bouyer Exp $"); + +#include +#include +#include + +#if defined(_KERNEL) +#include +#else +#include +#endif + +/* These functions are only needed if native byte order is not big endian */ +#if BYTE_ORDER == BIG_ENDIAN +void +e2fs_sb_bswap(struct ext2fs *old, struct ext2fs *new) +{ + + /* preserve unused fields */ + memcpy(new, old, sizeof(struct ext2fs)); + new->e2fs_icount = bswap32(old->e2fs_icount); + new->e2fs_bcount = bswap32(old->e2fs_bcount); + new->e2fs_rbcount = bswap32(old->e2fs_rbcount); + new->e2fs_fbcount = bswap32(old->e2fs_fbcount); + new->e2fs_ficount = bswap32(old->e2fs_ficount); + new->e2fs_first_dblock = bswap32(old->e2fs_first_dblock); + new->e2fs_log_bsize = bswap32(old->e2fs_log_bsize); + new->e2fs_fsize = bswap32(old->e2fs_fsize); + new->e2fs_bpg = bswap32(old->e2fs_bpg); + new->e2fs_fpg = bswap32(old->e2fs_fpg); + new->e2fs_ipg = bswap32(old->e2fs_ipg); + new->e2fs_mtime = bswap32(old->e2fs_mtime); + new->e2fs_wtime = bswap32(old->e2fs_wtime); + new->e2fs_mnt_count = bswap16(old->e2fs_mnt_count); + new->e2fs_max_mnt_count = bswap16(old->e2fs_max_mnt_count); + new->e2fs_magic = bswap16(old->e2fs_magic); + new->e2fs_state = bswap16(old->e2fs_state); + new->e2fs_beh = bswap16(old->e2fs_beh); + new->e2fs_minrev = bswap16(old->e2fs_minrev); + new->e2fs_lastfsck = bswap32(old->e2fs_lastfsck); + new->e2fs_fsckintv = bswap32(old->e2fs_fsckintv); + new->e2fs_creator = bswap32(old->e2fs_creator); + new->e2fs_rev = bswap32(old->e2fs_rev); + new->e2fs_ruid = bswap16(old->e2fs_ruid); + new->e2fs_rgid = bswap16(old->e2fs_rgid); + new->e2fs_first_ino = bswap32(old->e2fs_first_ino); + new->e2fs_inode_size = bswap16(old->e2fs_inode_size); + new->e2fs_block_group_nr = bswap16(old->e2fs_block_group_nr); + new->e2fs_features_compat = bswap32(old->e2fs_features_compat); + new->e2fs_features_incompat = bswap32(old->e2fs_features_incompat); + new->e2fs_features_rocompat = bswap32(old->e2fs_features_rocompat); + new->e2fs_algo = bswap32(old->e2fs_algo); + new->e2fs_reserved_ngdb = bswap16(old->e2fs_reserved_ngdb); +} + +void e2fs_cg_bswap(struct ext2_gd *old, struct ext2_gd *new, int size) +{ + int i; + + for (i = 0; i < (size / (int)sizeof(struct ext2_gd)); i++) { + new[i].ext2bgd_b_bitmap = bswap32(old[i].ext2bgd_b_bitmap); + new[i].ext2bgd_i_bitmap = bswap32(old[i].ext2bgd_i_bitmap); + new[i].ext2bgd_i_tables = bswap32(old[i].ext2bgd_i_tables); + new[i].ext2bgd_nbfree = bswap16(old[i].ext2bgd_nbfree); + new[i].ext2bgd_nifree = bswap16(old[i].ext2bgd_nifree); + new[i].ext2bgd_ndirs = bswap16(old[i].ext2bgd_ndirs); + } +} + +void e2fs_i_bswap(struct ext2fs_dinode *old, struct ext2fs_dinode *new) +{ + + new->e2di_mode = bswap16(old->e2di_mode); + new->e2di_uid = 
bswap16(old->e2di_uid); + new->e2di_gid = bswap16(old->e2di_gid); + new->e2di_nlink = bswap16(old->e2di_nlink); + new->e2di_size = bswap32(old->e2di_size); + new->e2di_atime = bswap32(old->e2di_atime); + new->e2di_ctime = bswap32(old->e2di_ctime); + new->e2di_mtime = bswap32(old->e2di_mtime); + new->e2di_dtime = bswap32(old->e2di_dtime); + new->e2di_nblock = bswap32(old->e2di_nblock); + new->e2di_flags = bswap32(old->e2di_flags); + new->e2di_gen = bswap32(old->e2di_gen); + new->e2di_facl = bswap32(old->e2di_facl); + new->e2di_dacl = bswap32(old->e2di_dacl); + new->e2di_faddr = bswap32(old->e2di_faddr); + new->e2di_uid_high = bswap16(old->e2di_uid_high); + new->e2di_gid_high = bswap16(old->e2di_gid_high); + memcpy(&new->e2di_blocks[0], &old->e2di_blocks[0], + (NDADDR + NIADDR) * sizeof(uint32_t)); +} +#endif diff --git a/include/ufs/ext2fs/ext2fs_dinode.h b/sys/ufs/ext2fs/ext2fs_dinode.h similarity index 100% rename from include/ufs/ext2fs/ext2fs_dinode.h rename to sys/ufs/ext2fs/ext2fs_dinode.h diff --git a/include/ufs/ext2fs/ext2fs_dir.h b/sys/ufs/ext2fs/ext2fs_dir.h similarity index 100% rename from include/ufs/ext2fs/ext2fs_dir.h rename to sys/ufs/ext2fs/ext2fs_dir.h diff --git a/include/ufs/ext2fs/ext2fs_extern.h b/sys/ufs/ext2fs/ext2fs_extern.h similarity index 100% rename from include/ufs/ext2fs/ext2fs_extern.h rename to sys/ufs/ext2fs/ext2fs_extern.h diff --git a/sys/ufs/ext2fs/ext2fs_inode.c b/sys/ufs/ext2fs/ext2fs_inode.c new file mode 100644 index 000000000..0d52fb494 --- /dev/null +++ b/sys/ufs/ext2fs/ext2fs_inode.c @@ -0,0 +1,558 @@ +/* $NetBSD: ext2fs_inode.c,v 1.74 2011/06/16 09:21:03 hannken Exp $ */ + +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_inode.c 8.8 (Berkeley) 10/19/94 + * Modified for ext2fs by Manuel Bouyer. + */ + +/* + * Copyright (c) 1997 Manuel Bouyer. 
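The e2fs_sb_bswap()/e2fs_cg_bswap()/e2fs_i_bswap() helpers just above exist because ext2 keeps every multi-byte field little-endian on disk, so a big-endian host must swap each field on the way in and out. A minimal stand-alone illustration with a hypothetical two-field record; the kernel uses bswap16()/bswap32() from libkern rather than these local swappers:

#include <stdio.h>
#include <stdint.h>

static uint16_t swap16(uint16_t x) { return (uint16_t)((x >> 8) | (x << 8)); }
static uint32_t swap32(uint32_t x)
{
    return ((x >> 24) & 0xff) | ((x >> 8) & 0xff00) |
           ((x << 8) & 0xff0000) | ((uint32_t)(x & 0xff) << 24);
}

/* A hypothetical on-disk record, by analogy with struct ext2fs_dinode. */
struct demo_dinode { uint16_t mode; uint32_t size; };

/* Field-by-field swap, exactly the shape of e2fs_i_bswap() above. */
static void demo_i_bswap(const struct demo_dinode *old, struct demo_dinode *new)
{
    new->mode = swap16(old->mode);
    new->size = swap32(old->size);
}

int main(void)
{
    struct demo_dinode on_disk = { 0xa481, 0x00001000 }, host;

    demo_i_bswap(&on_disk, &host);
    printf("mode 0x%04x -> 0x%04x, size 0x%08x -> 0x%08x\n",
        (unsigned)on_disk.mode, (unsigned)host.mode,
        (unsigned)on_disk.size, (unsigned)host.size);
    return 0;
}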
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @(#)ffs_inode.c 8.8 (Berkeley) 10/19/94 + * Modified for ext2fs by Manuel Bouyer. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ext2fs_inode.c,v 1.74 2011/06/16 09:21:03 hannken Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +extern int prtactive; + +static int ext2fs_indirtrunc(struct inode *, daddr_t, daddr_t, + daddr_t, int, long *); + +/* + * Get the size of an inode. + */ +uint64_t +ext2fs_size(struct inode *ip) +{ + uint64_t size = ip->i_e2fs_size; + + if ((ip->i_e2fs_mode & IFMT) == IFREG) + size |= (uint64_t)ip->i_e2fs_dacl << 32; + return size; +} + +int +ext2fs_setsize(struct inode *ip, uint64_t size) +{ + if ((ip->i_e2fs_mode & IFMT) == IFREG || + ip->i_e2fs_mode == 0) { + ip->i_e2fs_dacl = size >> 32; + if (size >= 0x80000000U) { + struct m_ext2fs *fs = ip->i_e2fs; + + if (fs->e2fs.e2fs_rev <= E2FS_REV0) { + /* Linux automagically upgrades to REV1 here! */ + return EFBIG; + } + if (!(fs->e2fs.e2fs_features_rocompat + & EXT2F_ROCOMPAT_LARGEFILE)) { + fs->e2fs.e2fs_features_rocompat |= + EXT2F_ROCOMPAT_LARGEFILE; + fs->e2fs_fmod = 1; + } + } + } else if (size >= 0x80000000U) + return EFBIG; + + ip->i_e2fs_size = size; + + return 0; +} + +/* + * Last reference to an inode. If necessary, write or delete it. + */ +int +ext2fs_inactive(void *v) +{ + struct vop_inactive_args /* { + struct vnode *a_vp; + bool *a_recycle; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + int error = 0; + + if (prtactive && vp->v_usecount != 0) + vprint("ext2fs_inactive: pushing active", vp); + /* Get rid of inodes related to stale file handles. 
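ext2fs_size()/ext2fs_setsize() below encode the revision-1 trick of storing a regular file's size in two halves: the low 32 bits in the inode's size field and the high 32 bits in the otherwise unused directory-ACL field. A small model of that split; the struct and field names here are hypothetical:

#include <stdio.h>
#include <stdint.h>

struct demo_inode {
    uint32_t size_lo;    /* plays the role of i_e2fs_size */
    uint32_t size_hi;    /* plays the role of i_e2fs_dacl, reused for IFREG */
};

static uint64_t demo_getsize(const struct demo_inode *ip)
{
    return (uint64_t)ip->size_hi << 32 | ip->size_lo;
}

static void demo_setsize(struct demo_inode *ip, uint64_t size)
{
    ip->size_lo = (uint32_t)size;
    ip->size_hi = (uint32_t)(size >> 32);  /* sizes >= 2 GB also need the
                                              ROCOMPAT_LARGEFILE feature bit */
}

int main(void)
{
    struct demo_inode ip;

    demo_setsize(&ip, 5ULL * 1024 * 1024 * 1024);          /* a 5 GB file */
    printf("lo=0x%08x hi=0x%08x -> size %llu\n",
        (unsigned)ip.size_lo, (unsigned)ip.size_hi,
        (unsigned long long)demo_getsize(&ip));
    return 0;
}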
*/ + if (ip->i_e2fs_mode == 0 || ip->i_e2fs_dtime != 0) + goto out; + + error = 0; + if (ip->i_e2fs_nlink == 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { + /* Defer final inode free and update to reclaim.*/ + if (ext2fs_size(ip) != 0) { + error = ext2fs_truncate(vp, (off_t)0, 0, NOCRED); + } + ip->i_e2fs_dtime = time_second; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + ip->i_omode = 1; + } + if (ip->i_flag & (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) { + ext2fs_update(vp, NULL, NULL, 0); + } +out: + /* + * If we are done with the inode, reclaim it + * so that it can be reused immediately. + */ + *ap->a_recycle = (ip->i_e2fs_dtime != 0); + VOP_UNLOCK(vp); + return (error); +} + + +/* + * Update the access, modified, and inode change times as specified by the + * IACCESS, IUPDATE, and ICHANGE flags respectively. The IMODIFIED flag is + * used to specify that the inode needs to be updated but that the times have + * already been set. The access and modified times are taken from the second + * and third parameters; the inode change time is always taken from the current + * time. If UPDATE_WAIT or UPDATE_DIROP is set, then wait for the disk + * write of the inode to complete. + */ +int +ext2fs_update(struct vnode *vp, const struct timespec *acc, + const struct timespec *mod, int updflags) +{ + struct m_ext2fs *fs; + struct buf *bp; + struct inode *ip; + int error; + void *cp; + int flags; + + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (0); + ip = VTOI(vp); + EXT2FS_ITIMES(ip, acc, mod, NULL); + if (updflags & UPDATE_CLOSE) + flags = ip->i_flag & (IN_MODIFIED | IN_ACCESSED); + else + flags = ip->i_flag & IN_MODIFIED; + if (flags == 0) + return (0); + fs = ip->i_e2fs; + + error = bread(ip->i_devvp, + fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), + (int)fs->e2fs_bsize, NOCRED, B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + return (error); + } + ip->i_flag &= ~(IN_MODIFIED | IN_ACCESSED); + cp = (char *)bp->b_data + + (ino_to_fsbo(fs, ip->i_number) * EXT2_DINODE_SIZE(fs)); + e2fs_isave(ip->i_din.e2fs_din, (struct ext2fs_dinode *)cp); + if ((updflags & (UPDATE_WAIT|UPDATE_DIROP)) != 0 && + (flags & IN_MODIFIED) != 0 && + (vp->v_mount->mnt_flag & MNT_ASYNC) == 0) + return (bwrite(bp)); + else { + bdwrite(bp); + return (0); + } +} + +#define SINGLE 0 /* index of single indirect block */ +#define DOUBLE 1 /* index of double indirect block */ +#define TRIPLE 2 /* index of triple indirect block */ +/* + * Truncate the inode oip to at most length size, freeing the + * disk blocks. 
+ */ +int +ext2fs_truncate(struct vnode *ovp, off_t length, int ioflag, + kauth_cred_t cred) +{ + daddr_t lastblock; + struct inode *oip = VTOI(ovp); + daddr_t bn, lastiblock[NIADDR], indir_lbn[NIADDR]; + /* XXX ondisk32 */ + int32_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR]; + struct m_ext2fs *fs; + int offset, size, level; + long count, blocksreleased = 0; + int i, nblocks; + int error, allerror = 0; + off_t osize; + int sync; + struct ufsmount *ump = oip->i_ump; + + if (ovp->v_type == VCHR || ovp->v_type == VBLK || + ovp->v_type == VFIFO || ovp->v_type == VSOCK) { + return 0; + } + + if (length < 0) + return (EINVAL); + + if (ovp->v_type == VLNK && + (ext2fs_size(oip) < ump->um_maxsymlinklen || + (ump->um_maxsymlinklen == 0 && oip->i_e2fs_nblock == 0))) { + KDASSERT(length == 0); + memset((char *)&oip->i_din.e2fs_din->e2di_shortlink, 0, + (u_int)ext2fs_size(oip)); + (void)ext2fs_setsize(oip, 0); + oip->i_flag |= IN_CHANGE | IN_UPDATE; + return (ext2fs_update(ovp, NULL, NULL, 0)); + } + if (ext2fs_size(oip) == length) { + /* still do a uvm_vnp_setsize() as writesize may be larger */ + uvm_vnp_setsize(ovp, length); + oip->i_flag |= IN_CHANGE | IN_UPDATE; + return (ext2fs_update(ovp, NULL, NULL, 0)); + } + fs = oip->i_e2fs; + if (length > ump->um_maxfilesize) + return (EFBIG); + + osize = ext2fs_size(oip); + + /* + * Lengthen the size of the file. We must ensure that the + * last byte of the file is allocated. Since the smallest + * value of osize is 0, length will be at least 1. + */ + if (osize < length) { + uvm_vnp_setwritesize(ovp, length); + error = ufs_balloc_range(ovp, length - 1, 1, cred, + ioflag & IO_SYNC ? B_SYNC : 0); + if (error) { + (void) ext2fs_truncate(ovp, osize, ioflag & IO_SYNC, + cred); + return (error); + } + uvm_vnp_setsize(ovp, length); + oip->i_flag |= IN_CHANGE | IN_UPDATE; + KASSERT(error || ovp->v_size == ext2fs_size(oip)); + return (ext2fs_update(ovp, NULL, NULL, 0)); + } + /* + * Shorten the size of the file. If the file is not being + * truncated to a block boundry, the contents of the + * partial block following the end of the file must be + * zero'ed in case it ever become accessible again because + * of subsequent file growth. + */ + offset = blkoff(fs, length); + if (offset != 0) { + size = fs->e2fs_bsize; + + /* XXXUBC we should handle more than just VREG */ + ubc_zerorange(&ovp->v_uobj, length, size - offset, + UBC_UNMAP_FLAG(ovp)); + } + (void)ext2fs_setsize(oip, length); + uvm_vnp_setsize(ovp, length); + /* + * Calculate index into inode's block list of + * last direct and indirect blocks (if any) + * which we want to keep. Lastblock is -1 when + * the file is truncated to 0. + */ + lastblock = lblkno(fs, length + fs->e2fs_bsize - 1) - 1; + lastiblock[SINGLE] = lastblock - NDADDR; + lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs); + lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs); + nblocks = btodb(fs->e2fs_bsize); + /* + * Update file and block pointers on disk before we start freeing + * blocks. If we crash before free'ing blocks below, the blocks + * will be returned to the free list. lastiblock values are also + * normalized to -1 for calls to ext2fs_indirtrunc below. 
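The index arithmetic just above decides what survives the truncate: lastblock is the last logical block still needed for the new length, and lastiblock[] says how far the file still reaches into each level of indirection (a negative value means that level disappears entirely). Worked through with made-up geometry (4 KB blocks, 12 direct pointers, 1024 pointers per indirect block):

#include <stdio.h>

#define BSIZE  4096
#define NDADDR 12
#define NINDIR 1024

int main(void)
{
    long long length = 10LL * 1024 * 1024;                   /* truncate to 10 MB */
    long long lastblock = (length + BSIZE - 1) / BSIZE - 1;  /* lblkno(...) - 1 */
    long long lastiblock[3];

    lastiblock[0] = lastblock - NDADDR;                          /* SINGLE */
    lastiblock[1] = lastiblock[0] - NINDIR;                      /* DOUBLE */
    lastiblock[2] = lastiblock[1] - (long long)NINDIR * NINDIR;  /* TRIPLE */

    printf("lastblock = %lld\n", lastblock);
    for (int i = 0; i < 3; i++)
        printf("lastiblock[%d] = %lld%s\n", i, lastiblock[i],
            lastiblock[i] < 0 ? "  (level freed entirely)" : "");
    return 0;
}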
+ */ + memcpy((void *)oldblks, (void *)&oip->i_e2fs_blocks[0], sizeof oldblks); + sync = 0; + for (level = TRIPLE; level >= SINGLE; level--) { + if (lastiblock[level] < 0 && oldblks[NDADDR + level] != 0) { + sync = 1; + oip->i_e2fs_blocks[NDADDR + level] = 0; + lastiblock[level] = -1; + } + } + for (i = 0; i < NDADDR; i++) { + if (i > lastblock && oldblks[i] != 0) { + sync = 1; + oip->i_e2fs_blocks[i] = 0; + } + } + oip->i_flag |= IN_CHANGE | IN_UPDATE; + if (sync) { + error = ext2fs_update(ovp, NULL, NULL, UPDATE_WAIT); + if (error && !allerror) + allerror = error; + } + + /* + * Having written the new inode to disk, save its new configuration + * and put back the old block pointers long enough to process them. + * Note that we save the new block configuration so we can check it + * when we are done. + */ + memcpy((void *)newblks, (void *)&oip->i_e2fs_blocks[0], sizeof newblks); + memcpy((void *)&oip->i_e2fs_blocks[0], (void *)oldblks, sizeof oldblks); + + (void)ext2fs_setsize(oip, osize); + error = vtruncbuf(ovp, lastblock + 1, 0, 0); + if (error && !allerror) + allerror = error; + + /* + * Indirect blocks first. + */ + indir_lbn[SINGLE] = -NDADDR; + indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) -1; + indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1; + for (level = TRIPLE; level >= SINGLE; level--) { + /* XXX ondisk32 */ + bn = fs2h32(oip->i_e2fs_blocks[NDADDR + level]); + if (bn != 0) { + error = ext2fs_indirtrunc(oip, indir_lbn[level], + fsbtodb(fs, bn), lastiblock[level], level, &count); + if (error) + allerror = error; + blocksreleased += count; + if (lastiblock[level] < 0) { + oip->i_e2fs_blocks[NDADDR + level] = 0; + ext2fs_blkfree(oip, bn); + blocksreleased += nblocks; + } + } + if (lastiblock[level] >= 0) + goto done; + } + + /* + * All whole direct blocks or frags. + */ + for (i = NDADDR - 1; i > lastblock; i--) { + /* XXX ondisk32 */ + bn = fs2h32(oip->i_e2fs_blocks[i]); + if (bn == 0) + continue; + oip->i_e2fs_blocks[i] = 0; + ext2fs_blkfree(oip, bn); + blocksreleased += btodb(fs->e2fs_bsize); + } + +done: +#ifdef DIAGNOSTIC + for (level = SINGLE; level <= TRIPLE; level++) + if (newblks[NDADDR + level] != + oip->i_e2fs_blocks[NDADDR + level]) + panic("ext2fs_truncate1"); + for (i = 0; i < NDADDR; i++) + if (newblks[i] != oip->i_e2fs_blocks[i]) + panic("ext2fs_truncate2"); + if (length == 0 && + (!LIST_EMPTY(&ovp->v_cleanblkhd) || + !LIST_EMPTY(&ovp->v_dirtyblkhd))) + panic("ext2fs_truncate3"); +#endif /* DIAGNOSTIC */ + /* + * Put back the real size. + */ + (void)ext2fs_setsize(oip, length); + oip->i_e2fs_nblock -= blocksreleased; + oip->i_flag |= IN_CHANGE; + KASSERT(ovp->v_type != VREG || ovp->v_size == ext2fs_size(oip)); + return (allerror); +} + +/* + * Release blocks associated with the inode ip and stored in the indirect + * block bn. Blocks are free'd in LIFO order up to (but not including) + * lastbn. If level is greater than SINGLE, the block is an indirect block + * and recursive calls to indirtrunc must be used to cleanse other indirect + * blocks. + * + * NB: triple indirect blocks are untested. 
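Inside ext2fs_indirtrunc(), factor is how many data blocks one pointer spans at the current level, lastbn / factor is the last pointer in this indirect block to keep, and lastbn % factor becomes the child's own lastbn in the recursive call on that last, partially kept pointer. A numeric sketch for a double-indirect block, continuing the made-up geometry used above:

#include <stdio.h>

#define NINDIR 1024   /* pointers per indirect block (made-up, as above) */

int main(void)
{
    long long lastbn = 1523;                  /* e.g. lastiblock[DOUBLE]      */
    long long factor = NINDIR;                /* NINDIR^1 at the DOUBLE level */
    long long last = lastbn / factor;         /* last pointer kept here       */
    long long child_lastbn = lastbn % factor; /* passed down to level - 1     */

    printf("keep pointers 0..%lld, free pointers %lld..%d\n",
        last, last + 1, NINDIR - 1);
    printf("recurse on pointer %lld with lastbn %lld\n", last, child_lastbn);
    return 0;
}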
+ */ +static int +ext2fs_indirtrunc(struct inode *ip, daddr_t lbn, daddr_t dbn, daddr_t lastbn, + int level, long *countp) +{ + int i; + struct buf *bp; + struct m_ext2fs *fs = ip->i_e2fs; + int32_t *bap; /* XXX ondisk32 */ + struct vnode *vp; + daddr_t nb, nlbn, last; + int32_t *copy = NULL; /* XXX ondisk32 */ + long blkcount, factor; + int nblocks, blocksreleased = 0; + int error = 0, allerror = 0; + + /* + * Calculate index in current block of last + * block to be kept. -1 indicates the entire + * block so we need not calculate the index. + */ + factor = 1; + for (i = SINGLE; i < level; i++) + factor *= NINDIR(fs); + last = lastbn; + if (lastbn > 0) + last /= factor; + nblocks = btodb(fs->e2fs_bsize); + /* + * Get buffer of block pointers, zero those entries corresponding + * to blocks to be free'd, and update on disk copy first. Since + * double(triple) indirect before single(double) indirect, calls + * to bmap on these blocks will fail. However, we already have + * the on disk address, so we have to set the b_blkno field + * explicitly instead of letting bread do everything for us. + */ + vp = ITOV(ip); + bp = getblk(vp, lbn, (int)fs->e2fs_bsize, 0, 0); + if (bp->b_oflags & (BO_DONE | BO_DELWRI)) { + /* Braces must be here in case trace evaluates to nothing. */ + trace(TR_BREADHIT, pack(vp, fs->e2fs_bsize), lbn); + } else { + trace(TR_BREADMISS, pack(vp, fs->e2fs_bsize), lbn); + curlwp->l_ru.ru_inblock++; /* pay for read */ + bp->b_flags |= B_READ; + if (bp->b_bcount > bp->b_bufsize) + panic("ext2fs_indirtrunc: bad buffer size"); + bp->b_blkno = dbn; + VOP_STRATEGY(vp, bp); + error = biowait(bp); + } + if (error) { + brelse(bp, 0); + *countp = 0; + return (error); + } + + bap = (int32_t *)bp->b_data; /* XXX ondisk32 */ + if (lastbn >= 0) { + /* XXX ondisk32 */ + copy = malloc(fs->e2fs_bsize, M_TEMP, M_WAITOK); + memcpy((void *)copy, (void *)bap, (u_int)fs->e2fs_bsize); + memset((void *)&bap[last + 1], 0, + (u_int)(NINDIR(fs) - (last + 1)) * sizeof (uint32_t)); + error = bwrite(bp); + if (error) + allerror = error; + bap = copy; + } + + /* + * Recursively free totally unused blocks. + */ + for (i = NINDIR(fs) - 1, + nlbn = lbn + 1 - i * factor; i > last; + i--, nlbn += factor) { + /* XXX ondisk32 */ + nb = fs2h32(bap[i]); + if (nb == 0) + continue; + if (level > SINGLE) { + error = ext2fs_indirtrunc(ip, nlbn, fsbtodb(fs, nb), + (daddr_t)-1, level - 1, + &blkcount); + if (error) + allerror = error; + blocksreleased += blkcount; + } + ext2fs_blkfree(ip, nb); + blocksreleased += nblocks; + } + + /* + * Recursively free last partial block. 
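Each slot of an indirect block at a given level covers NINDIR^level data blocks, which is what the factor/last computation at the top of ext2fs_indirtrunc expresses. A small stand-alone sketch of that arithmetic follows, again assuming 1024 pointers per indirect block; it reproduces only the index math, not the buffer handling.

#include <stdio.h>

#define BSIZE  4096
#define NINDIR (BSIZE / 4)      /* assumed pointers per indirect block */

enum { SINGLE = 0, DOUBLE = 1, TRIPLE = 2 };

int
main(void)
{
    long long lastbn = 5000;    /* last data block to keep, relative to this level */
    long long factor = 1, last, freed;
    int i, level = DOUBLE;      /* example: a double-indirect block */

    /* factor = NINDIR^level: data blocks covered by one slot at this level. */
    for (i = SINGLE; i < level; i++)
        factor *= NINDIR;

    last = lastbn / factor;             /* last slot kept in this block */
    freed = NINDIR - 1 - last;          /* slots freed entirely, recursively */
    printf("factor %lld, last slot kept %lld, slots freed %lld\n",
        factor, last, freed);
    return 0;
}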
+ */ + if (level > SINGLE && lastbn >= 0) { + last = lastbn % factor; + /* XXX ondisk32 */ + nb = fs2h32(bap[i]); + if (nb != 0) { + error = ext2fs_indirtrunc(ip, nlbn, fsbtodb(fs, nb), + last, level - 1, &blkcount); + if (error) + allerror = error; + blocksreleased += blkcount; + } + } + + if (copy != NULL) { + free(copy, M_TEMP); + } else { + brelse(bp, BC_INVAL); + } + + *countp = blocksreleased; + return (allerror); +} diff --git a/sys/ufs/ext2fs/ext2fs_lookup.c b/sys/ufs/ext2fs/ext2fs_lookup.c new file mode 100644 index 000000000..eb59c45a0 --- /dev/null +++ b/sys/ufs/ext2fs/ext2fs_lookup.c @@ -0,0 +1,1079 @@ +/* $NetBSD: ext2fs_lookup.c,v 1.66 2011/07/12 16:59:48 dholland Exp $ */ + +/* + * Modified for NetBSD 1.2E + * May 1997, Manuel Bouyer + * Laboratoire d'informatique de Paris VI + */ +/* + * modified for Lites 1.1 + * + * Aug 1995, Godmar Back (gback@cs.utah.edu) + * University of Utah, Department of Computer Science + */ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ufs_lookup.c 8.6 (Berkeley) 4/1/94 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ext2fs_lookup.c,v 1.66 2011/07/12 16:59:48 dholland Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +extern int dirchk; + +static void ext2fs_dirconv2ffs(struct ext2fs_direct *e2dir, + struct dirent *ffsdir); +static int ext2fs_dirbadentry(struct vnode *dp, + struct ext2fs_direct *de, + int entryoffsetinblock); + +/* + * the problem that is tackled below is the fact that FFS + * includes the terminating zero on disk while EXT2FS doesn't + * this implies that we need to introduce some padding. + * For instance, a filename "sbin" has normally a reclen 12 + * in EXT2, but 16 in FFS. + * This reminds me of that Pepsi commercial: 'Kid saved a lousy nine cents...' + * If it wasn't for that, the complete ufs code for directories would + * have worked w/o changes (except for the difference in DIRBLKSIZ) + */ +static void +ext2fs_dirconv2ffs(struct ext2fs_direct *e2dir, struct dirent *ffsdir) +{ + memset(ffsdir, 0, sizeof(struct dirent)); + ffsdir->d_fileno = fs2h32(e2dir->e2d_ino); + ffsdir->d_namlen = e2dir->e2d_namlen; + + ffsdir->d_type = DT_UNKNOWN; /* don't know more here */ +#ifdef DIAGNOSTIC +#if MAXNAMLEN < E2FS_MAXNAMLEN + /* + * we should handle this more gracefully ! + */ + if (e2dir->e2d_namlen > MAXNAMLEN) + panic("ext2fs: e2dir->e2d_namlen"); +#endif +#endif + strncpy(ffsdir->d_name, e2dir->e2d_name, ffsdir->d_namlen); + + /* Godmar thinks: since e2dir->e2d_reclen can be big and means + nothing anyway, we compute our own reclen according to what + we think is right + */ + ffsdir->d_reclen = _DIRENT_SIZE(ffsdir); +} + +/* + * Vnode op for reading directories. + * + * Convert the on-disk entries to entries. + * the problem is that the conversion will blow up some entries by four bytes, + * so it can't be done in place. This is too bad. Right now the conversion is + * done entry by entry, the converted entry is sent via uiomove. + * + * XXX allocate a buffer, convert as many entries as possible, then send + * the whole buffer to uiomove + */ +int +ext2fs_readdir(void *v) +{ + struct vop_readdir_args /* { + struct vnode *a_vp; + struct uio *a_uio; + kauth_cred_t a_cred; + int **a_eofflag; + off_t **a_cookies; + int ncookies; + } */ *ap = v; + struct uio *uio = ap->a_uio; + int error; + size_t e2fs_count, readcnt; + struct vnode *vp = ap->a_vp; + struct m_ext2fs *fs = VTOI(vp)->i_e2fs; + + struct ext2fs_direct *dp; + struct dirent *dstd; + struct uio auio; + struct iovec aiov; + void *dirbuf; + off_t off = uio->uio_offset; + off_t *cookies = NULL; + int nc = 0, ncookies = 0; + int e2d_reclen; + + if (vp->v_type != VDIR) + return (ENOTDIR); + + e2fs_count = uio->uio_resid; + /* Make sure we don't return partial entries. 
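The reclen difference described in the comment above is plain arithmetic: the ext2 record is an 8-byte header plus the name rounded up to 4 bytes, while the FFS-style dirent also stores the terminating NUL before rounding. The sketch below uses illustrative helpers, not the kernel's EXT2FS_DIRSIZ/_DIRENT_SIZE macros, and reproduces the 12-versus-16 example for "sbin".

#include <stdio.h>
#include <string.h>

/* Round x up to a multiple of 4, as both formats do for records. */
#define ROUNDUP4(x)     (((x) + 3) & ~3u)

/* ext2 on-disk entry: inode(4) + reclen(2) + namlen(1) + type(1) + name. */
static unsigned
ext2_dirsiz(unsigned namlen)
{
    return 8 + ROUNDUP4(namlen);
}

/* FFS-style dirent: same 8-byte header, but the name keeps its NUL byte. */
static unsigned
ffs_dirsiz(unsigned namlen)
{
    return 8 + ROUNDUP4(namlen + 1);
}

int
main(void)
{
    const char *name = "sbin";
    unsigned n = (unsigned)strlen(name);

    printf("%s: ext2 reclen %u, ffs reclen %u\n",
        name, ext2_dirsiz(n), ffs_dirsiz(n));   /* prints 12 and 16 */
    return 0;
}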
*/ + e2fs_count -= (uio->uio_offset + e2fs_count) & (fs->e2fs_bsize -1); + if (e2fs_count <= 0) + return (EINVAL); + + auio = *uio; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_len = e2fs_count; + auio.uio_resid = e2fs_count; + UIO_SETUP_SYSSPACE(&auio); + dirbuf = malloc(e2fs_count, M_TEMP, M_WAITOK); + dstd = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK | M_ZERO); + if (ap->a_ncookies) { + nc = e2fs_count / _DIRENT_MINSIZE((struct dirent *)0); + ncookies = nc; + cookies = malloc(sizeof (off_t) * ncookies, M_TEMP, M_WAITOK); + *ap->a_cookies = cookies; + } + memset(dirbuf, 0, e2fs_count); + aiov.iov_base = dirbuf; + + error = VOP_READ(ap->a_vp, &auio, 0, ap->a_cred); + if (error == 0) { + readcnt = e2fs_count - auio.uio_resid; + for (dp = (struct ext2fs_direct *)dirbuf; + (char *)dp < (char *)dirbuf + readcnt; ) { + e2d_reclen = fs2h16(dp->e2d_reclen); + if (e2d_reclen == 0) { + error = EIO; + break; + } + ext2fs_dirconv2ffs(dp, dstd); + if(dstd->d_reclen > uio->uio_resid) { + break; + } + error = uiomove(dstd, dstd->d_reclen, uio); + if (error != 0) { + break; + } + off = off + e2d_reclen; + if (cookies != NULL) { + *cookies++ = off; + if (--ncookies <= 0){ + break; /* out of cookies */ + } + } + /* advance dp */ + dp = (struct ext2fs_direct *) ((char *)dp + e2d_reclen); + } + /* we need to correct uio_offset */ + uio->uio_offset = off; + } + free(dirbuf, M_TEMP); + free(dstd, M_TEMP); + *ap->a_eofflag = ext2fs_size(VTOI(ap->a_vp)) <= uio->uio_offset; + if (ap->a_ncookies) { + if (error) { + free(*ap->a_cookies, M_TEMP); + *ap->a_ncookies = 0; + *ap->a_cookies = NULL; + } else + *ap->a_ncookies = nc - ncookies; + } + return (error); +} + +/* + * Convert a component of a pathname into a pointer to a locked inode. + * This is a very central and rather complicated routine. + * If the file system is not maintained in a strict tree hierarchy, + * this can result in a deadlock situation (see comments in code below). + * + * The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending + * on whether the name is to be looked up, created, renamed, or deleted. + * When CREATE, RENAME, or DELETE is specified, information usable in + * creating, renaming, or deleting a directory entry may be calculated. + * If flag has LOCKPARENT or'ed into it and the target of the pathname + * exists, lookup returns both the target and its parent directory locked. + * When creating or renaming and LOCKPARENT is specified, the target may + * not be ".". When deleting and LOCKPARENT is specified, the target may + * be "."., but the caller must check to ensure it does an vrele and vput + * instead of two vputs. 
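The readdir loop above advances strictly by e2d_reclen, so a zero record length has to be treated as corruption (EIO) rather than looped on. A user-space walk over a single directory block is sketched below; the on-disk structure is a hand-rolled stand-in and the fs2h16/fs2h32 byte-order conversions are omitted, so it assumes a little-endian host.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Minimal stand-in for the on-disk ext2 directory entry header. */
struct e2dirent {
    uint32_t ino;
    uint16_t reclen;
    uint8_t  namlen;
    uint8_t  type;
    char     name[];
};

/* Walk the entries in one block; returns -1 on a corrupt (zero) reclen. */
static int
walk_block(const unsigned char *blk, size_t bsize)
{
    size_t off = 0;

    while (off < bsize) {
        const struct e2dirent *de = (const void *)(blk + off);

        if (de->reclen == 0)
            return -1;                  /* would otherwise loop forever */
        if (de->ino != 0)
            printf("%6u %.*s\n", (unsigned)de->ino, de->namlen, de->name);
        off += de->reclen;              /* advance by record length */
    }
    return 0;
}

int
main(void)
{
    /* One 32-byte toy "block": a single entry named "sbin" padded to the end. */
    unsigned char blk[32];
    struct e2dirent *de = (void *)blk;

    memset(blk, 0, sizeof(blk));
    de->ino = 11;
    de->reclen = sizeof(blk);
    de->namlen = 4;
    memcpy(de->name, "sbin", 4);
    return walk_block(blk, sizeof(blk)) == 0 ? 0 : 1;
}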
+ * + * Overall outline of ext2fs_lookup: + * + * check accessibility of directory + * look for name in cache, if found, then if at end of path + * and deleting or creating, drop it, else return name + * search for name in directory, to found or notfound + * notfound: + * if creating, return locked directory, leaving info on available slots + * else return error + * found: + * if at end of path and deleting, return information to allow delete + * if at end of path and rewriting (RENAME and LOCKPARENT), lock target + * inode and return info to allow rewrite + * if not at end, add name to cache; if at end and neither creating + * nor deleting, add name to cache + */ +int +ext2fs_lookup(void *v) +{ + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap = v; + struct vnode *vdp = ap->a_dvp; /* vnode for directory being searched */ + struct inode *dp = VTOI(vdp); /* inode for directory being searched */ + struct buf *bp; /* a buffer of directory entries */ + struct ext2fs_direct *ep; /* the current directory entry */ + int entryoffsetinblock; /* offset of ep in bp's buffer */ + enum {NONE, COMPACT, FOUND} slotstatus; + doff_t slotoffset; /* offset of area with free space */ + int slotsize; /* size of area at slotoffset */ + int slotfreespace; /* amount of space free in slot */ + int slotneeded; /* size of the entry we're seeking */ + int numdirpasses; /* strategy for directory search */ + doff_t endsearch; /* offset to end directory search */ + doff_t prevoff; /* prev entry dp->i_offset */ + struct vnode *pdp; /* saved dp during symlink work */ + struct vnode *tdp; /* returned by VFS_VGET */ + doff_t enduseful; /* pointer past last used dir slot */ + u_long bmask; /* block offset mask */ + int namlen, error; + struct vnode **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + kauth_cred_t cred = cnp->cn_cred; + int flags; + int nameiop = cnp->cn_nameiop; + struct ufsmount *ump = dp->i_ump; + int dirblksiz = ump->um_dirblksiz; + ino_t foundino; + struct ufs_lookup_results *results; + + flags = cnp->cn_flags; + + bp = NULL; + slotoffset = -1; + *vpp = NULL; + + /* + * Produce the auxiliary lookup results into i_crap. Increment + * its serial number so elsewhere we can tell if we're using + * stale results. This should not be done this way. XXX. + */ + results = &dp->i_crap; + dp->i_crapcounter++; + + /* + * Check accessiblity of directory. + */ + if ((error = VOP_ACCESS(vdp, VEXEC, cred)) != 0) + return (error); + + if ((flags & ISLASTCN) && (vdp->v_mount->mnt_flag & MNT_RDONLY) && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) + return (EROFS); + + /* + * We now have a segment name to search for, and a directory to search. + * + * Before tediously performing a linear scan of the directory, + * check the name cache to see if the directory/name pair + * we are looking for is known already. + */ + if ((error = cache_lookup(vdp, vpp, cnp)) >= 0) + return (error); + + /* + * Suppress search for slots unless creating + * file and at end of pathname, in which case + * we watch for a place to put the new file in + * case it doesn't already exist. + */ + slotstatus = FOUND; + slotfreespace = slotsize = slotneeded = 0; + if ((nameiop == CREATE || nameiop == RENAME) && + (flags & ISLASTCN)) { + slotstatus = NONE; + slotneeded = EXT2FS_DIRSIZ(cnp->cn_namelen); + } + + /* + * If there is cached information on a previous search of + * this directory, pick up where we last left off. 
+ * We cache only lookups as these are the most common + * and have the greatest payoff. Caching CREATE has little + * benefit as it usually must search the entire directory + * to determine that the entry does not exist. Caching the + * location of the last DELETE or RENAME has not reduced + * profiling time and hence has been removed in the interest + * of simplicity. + */ + bmask = vdp->v_mount->mnt_stat.f_iosize - 1; + if (nameiop != LOOKUP || results->ulr_diroff == 0 || + results->ulr_diroff >= ext2fs_size(dp)) { + entryoffsetinblock = 0; + results->ulr_offset = 0; + numdirpasses = 1; + } else { + results->ulr_offset = results->ulr_diroff; + if ((entryoffsetinblock = results->ulr_offset & bmask) && + (error = ext2fs_blkatoff(vdp, (off_t)results->ulr_offset, NULL, &bp))) + return (error); + numdirpasses = 2; + nchstats.ncs_2passes++; + } + prevoff = results->ulr_offset; + endsearch = roundup(ext2fs_size(dp), dirblksiz); + enduseful = 0; + +searchloop: + while (results->ulr_offset < endsearch) { + if (curcpu()->ci_schedstate.spc_flags & SPCF_SHOULDYIELD) + preempt(); + /* + * If necessary, get the next directory block. + */ + if ((results->ulr_offset & bmask) == 0) { + if (bp != NULL) + brelse(bp, 0); + error = ext2fs_blkatoff(vdp, (off_t)results->ulr_offset, NULL, + &bp); + if (error != 0) + return (error); + entryoffsetinblock = 0; + } + /* + * If still looking for a slot, and at a dirblksize + * boundary, have to start looking for free space again. + */ + if (slotstatus == NONE && + (entryoffsetinblock & (dirblksiz - 1)) == 0) { + slotoffset = -1; + slotfreespace = 0; + } + /* + * Get pointer to next entry. + * Full validation checks are slow, so we only check + * enough to insure forward progress through the + * directory. Complete checks can be run by patching + * "dirchk" to be true. + */ + KASSERT(bp != NULL); + ep = (struct ext2fs_direct *) + ((char *)bp->b_data + entryoffsetinblock); + if (ep->e2d_reclen == 0 || + (dirchk && + ext2fs_dirbadentry(vdp, ep, entryoffsetinblock))) { + int i; + + ufs_dirbad(dp, results->ulr_offset, "mangled entry"); + i = dirblksiz - (entryoffsetinblock & (dirblksiz - 1)); + results->ulr_offset += i; + entryoffsetinblock += i; + continue; + } + + /* + * If an appropriate sized slot has not yet been found, + * check to see if one is available. Also accumulate space + * in the current block so that we can determine if + * compaction is viable. + */ + if (slotstatus != FOUND) { + int size = fs2h16(ep->e2d_reclen); + + if (ep->e2d_ino != 0) + size -= EXT2FS_DIRSIZ(ep->e2d_namlen); + if (size > 0) { + if (size >= slotneeded) { + slotstatus = FOUND; + slotoffset = results->ulr_offset; + slotsize = fs2h16(ep->e2d_reclen); + } else if (slotstatus == NONE) { + slotfreespace += size; + if (slotoffset == -1) + slotoffset = results->ulr_offset; + if (slotfreespace >= slotneeded) { + slotstatus = COMPACT; + slotsize = results->ulr_offset + + fs2h16(ep->e2d_reclen) - + slotoffset; + } + } + } + } + + /* + * Check for a name match. + */ + if (ep->e2d_ino) { + namlen = ep->e2d_namlen; + if (namlen == cnp->cn_namelen && + !memcmp(cnp->cn_nameptr, ep->e2d_name, + (unsigned)namlen)) { + /* + * Save directory entry's inode number and + * reclen in ndp->ni_ufs area, and release + * directory buffer. 
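The slot search in the loop that follows distinguishes FOUND (one record already has enough slack), COMPACT (adjacent records could be squeezed together to make room) and NONE. The slack of a live record is its reclen minus the space its own name needs. A sketch of that bookkeeping is below, with an illustrative EXT2_DIRSIZ helper and without the per-dirblock reset the kernel performs.

#include <stdio.h>

#define ROUNDUP4(x)     (((x) + 3) & ~3u)
#define EXT2_DIRSIZ(n)  (8 + ROUNDUP4(n))       /* illustrative, not the kernel macro */

struct rec { unsigned ino, reclen, namlen; };

int
main(void)
{
    /* Three consecutive records in one directory block (toy data). */
    struct rec recs[] = {
        { 12, 16, 5 },          /* live, no slack: 16 - 16 = 0    */
        {  0, 12, 3 },          /* deleted: whole record is spare  */
        { 37, 24, 4 },          /* live: 24 - 12 = 12 bytes spare  */
    };
    unsigned needed = EXT2_DIRSIZ(8);   /* entry we would like to insert */
    unsigned spare, freespace = 0;
    size_t i;

    for (i = 0; i < sizeof(recs) / sizeof(recs[0]); i++) {
        spare = recs[i].reclen;
        if (recs[i].ino != 0)
            spare -= EXT2_DIRSIZ(recs[i].namlen);
        if (spare >= needed) {
            printf("FOUND: record %zu alone has %u spare bytes\n", i, spare);
            return 0;
        }
        freespace += spare;             /* accumulate toward compaction */
        if (freespace >= needed) {
            printf("COMPACT: records 0..%zu free %u bytes together\n",
                i, freespace);
            return 0;
        }
    }
    printf("NONE: append at the end of the directory\n");
    return 0;
}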
+ */ + foundino = fs2h32(ep->e2d_ino); + results->ulr_reclen = fs2h16(ep->e2d_reclen); + goto found; + } + } + prevoff = results->ulr_offset; + results->ulr_offset += fs2h16(ep->e2d_reclen); + entryoffsetinblock += fs2h16(ep->e2d_reclen); + if (ep->e2d_ino) + enduseful = results->ulr_offset; + } +/* notfound: */ + /* + * If we started in the middle of the directory and failed + * to find our target, we must check the beginning as well. + */ + if (numdirpasses == 2) { + numdirpasses--; + results->ulr_offset = 0; + endsearch = results->ulr_diroff; + goto searchloop; + } + if (bp != NULL) + brelse(bp, 0); + /* + * If creating, and at end of pathname and current + * directory has not been removed, then can consider + * allowing file to be created. + */ + if ((nameiop == CREATE || nameiop == RENAME) && + (flags & ISLASTCN) && dp->i_e2fs_nlink != 0) { + /* + * Access for write is interpreted as allowing + * creation of files in the directory. + */ + error = VOP_ACCESS(vdp, VWRITE, cred); + if (error) + return (error); + /* + * Return an indication of where the new directory + * entry should be put. If we didn't find a slot, + * then set results->ulr_count to 0 indicating + * that the new slot belongs at the end of the + * directory. If we found a slot, then the new entry + * can be put in the range from results->ulr_offset to + * results->ulr_offset + results->ulr_count. + */ + if (slotstatus == NONE) { + results->ulr_offset = roundup(ext2fs_size(dp), dirblksiz); + results->ulr_count = 0; + enduseful = results->ulr_offset; + } else { + results->ulr_offset = slotoffset; + results->ulr_count = slotsize; + if (enduseful < slotoffset + slotsize) + enduseful = slotoffset + slotsize; + } + results->ulr_endoff = roundup(enduseful, dirblksiz); +#if 0 + dp->i_flag |= IN_CHANGE | IN_UPDATE; +#endif + /* + * We return with the directory locked, so that + * the parameters we set up above will still be + * valid if we actually decide to do a direnter(). + * We return ni_vp == NULL to indicate that the entry + * does not currently exist; we leave a pointer to + * the (locked) directory inode in ndp->ni_dvp. + * + * NB - if the directory is unlocked, then this + * information cannot be used. + */ + return (EJUSTRETURN); + } + /* + * Insert name into cache (as non-existent) if appropriate. + */ + if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE) + cache_enter(vdp, *vpp, cnp); + return (ENOENT); + +found: + if (numdirpasses == 2) + nchstats.ncs_pass2++; + /* + * Check that directory length properly reflects presence + * of this entry. + */ + if (results->ulr_offset + EXT2FS_DIRSIZ(ep->e2d_namlen) > ext2fs_size(dp)) { + ufs_dirbad(dp, results->ulr_offset, "i_size too small"); + error = ext2fs_setsize(dp, + results->ulr_offset + EXT2FS_DIRSIZ(ep->e2d_namlen)); + if (error) { + brelse(bp, 0); + return (error); + } + dp->i_flag |= IN_CHANGE | IN_UPDATE; + uvm_vnp_setsize(vdp, ext2fs_size(dp)); + } + brelse(bp, 0); + + /* + * Found component in pathname. + * If the final component of path name, save information + * in the cache as to where the entry was found. + */ + if ((flags & ISLASTCN) && nameiop == LOOKUP) + results->ulr_diroff = results->ulr_offset &~ (dirblksiz - 1); + + /* + * If deleting, and at end of pathname, return + * parameters which can be used to remove file. + * Lock the inode, being careful with ".". + */ + if (nameiop == DELETE && (flags & ISLASTCN)) { + /* + * Write access to directory required to delete files. 
+ */ + if ((error = VOP_ACCESS(vdp, VWRITE, cred)) != 0) + return (error); + /* + * Return pointer to current entry in results->ulr_offset, + * and distance past previous entry (if there + * is a previous entry in this block) in results->ulr_count. + * Save directory inode pointer in ndp->ni_dvp for dirremove(). + */ + if ((results->ulr_offset & (dirblksiz - 1)) == 0) + results->ulr_count = 0; + else + results->ulr_count = results->ulr_offset - prevoff; + if (dp->i_number == foundino) { + vref(vdp); + *vpp = vdp; + return (0); + } + if (flags & ISDOTDOT) + VOP_UNLOCK(vdp); /* race to get the inode */ + error = VFS_VGET(vdp->v_mount, foundino, &tdp); + if (flags & ISDOTDOT) + vn_lock(vdp, LK_EXCLUSIVE | LK_RETRY); + if (error) + return (error); + /* + * If directory is "sticky", then user must own + * the directory, or the file in it, else she + * may not delete it (unless she's root). This + * implements append-only directories. + */ + if ((dp->i_e2fs_mode & ISVTX) && + kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL) && + kauth_cred_geteuid(cred) != dp->i_uid && + VTOI(tdp)->i_uid != kauth_cred_geteuid(cred)) { + vput(tdp); + return (EPERM); + } + *vpp = tdp; + return (0); + } + + /* + * If rewriting (RENAME), return the inode and the + * information required to rewrite the present directory + * Must get inode of directory entry to verify it's a + * regular file, or empty directory. + */ + if (nameiop == RENAME && (flags & ISLASTCN)) { + error = VOP_ACCESS(vdp, VWRITE, cred); + if (error) + return (error); + /* + * Careful about locking second inode. + * This can only occur if the target is ".". + */ + if (dp->i_number == foundino) + return (EISDIR); + if (flags & ISDOTDOT) + VOP_UNLOCK(vdp); /* race to get the inode */ + error = VFS_VGET(vdp->v_mount, foundino, &tdp); + if (flags & ISDOTDOT) + vn_lock(vdp, LK_EXCLUSIVE | LK_RETRY); + if (error) + return (error); + *vpp = tdp; + return (0); + } + + /* + * Step through the translation in the name. We do not `vput' the + * directory because we may need it again if a symbolic link + * is relative to the current directory. Instead we save it + * unlocked as "pdp". We must get the target inode before unlocking + * the directory to insure that the inode will not be removed + * before we get it. We prevent deadlock by always fetching + * inodes from the root, moving down the directory tree. Thus + * when following backward pointers ".." we must unlock the + * parent directory before getting the requested directory. + * There is a potential race condition here if both the current + * and parent directories are removed before the VFS_VGET for the + * inode associated with ".." returns. We hope that this occurs + * infrequently since we cannot avoid this race condition without + * implementing a sophisticated deadlock detection algorithm. + * Note also that this simple deadlock detection scheme will not + * work if the file system has any hard links other than ".." + * that point backwards in the directory structure. + */ + pdp = vdp; + if (flags & ISDOTDOT) { + VOP_UNLOCK(pdp); /* race to get the inode */ + error = VFS_VGET(vdp->v_mount, foundino, &tdp); + vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY); + if (error) { + return (error); + } + *vpp = tdp; + } else if (dp->i_number == foundino) { + vref(vdp); /* we want ourself, ie "." */ + *vpp = vdp; + } else { + error = VFS_VGET(vdp->v_mount, foundino, &tdp); + if (error) + return (error); + *vpp = tdp; + } + + /* + * Insert name into cache if appropriate. 
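The sticky-directory rule applied in the DELETE case above reduces to a simple predicate: in a sticky directory only root, the directory owner, or the file owner may remove an entry. A hedged user-space sketch with plain numeric uids, not the kauth-based kernel check:

#include <stdbool.h>
#include <stdio.h>

/*
 * May "uid" remove a file owned by "file_uid" from a sticky directory
 * owned by "dir_uid"?
 */
static bool
sticky_delete_ok(unsigned uid, unsigned dir_uid, unsigned file_uid)
{
    if (uid == 0)
        return true;                    /* root may always delete */
    return uid == dir_uid || uid == file_uid;
}

int
main(void)
{
    printf("%d\n", sticky_delete_ok(1000, 0, 1000));    /* own file: 1 */
    printf("%d\n", sticky_delete_ok(1000, 0, 1001));    /* someone else's: 0 */
    return 0;
}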
+ */ + if (cnp->cn_flags & MAKEENTRY) + cache_enter(vdp, *vpp, cnp); + return (0); +} + +/* + * Do consistency checking on a directory entry: + * record length must be multiple of 4 + * entry must fit in rest of its dirblksize block + * record must be large enough to contain entry + * name is not longer than EXT2FS_MAXNAMLEN + * name must be as long as advertised, and null terminated + */ +/* + * changed so that it confirms to ext2fs_check_dir_entry + */ +static int +ext2fs_dirbadentry(struct vnode *dp, struct ext2fs_direct *de, + int entryoffsetinblock) +{ + struct ufsmount *ump = VFSTOUFS(dp->v_mount); + int dirblksiz = ump->um_dirblksiz; + + const char *error_msg = NULL; + int reclen = fs2h16(de->e2d_reclen); + int namlen = de->e2d_namlen; + + if (reclen < EXT2FS_DIRSIZ(1)) /* e2d_namlen = 1 */ + error_msg = "rec_len is smaller than minimal"; + else if (reclen % 4 != 0) + error_msg = "rec_len % 4 != 0"; + else if (namlen > EXT2FS_MAXNAMLEN) + error_msg = "namlen > EXT2FS_MAXNAMLEN"; + else if (reclen < EXT2FS_DIRSIZ(namlen)) + error_msg = "reclen is too small for name_len"; + else if (entryoffsetinblock + reclen > dirblksiz) + error_msg = "directory entry across blocks"; + else if (fs2h32(de->e2d_ino) > + VTOI(dp)->i_e2fs->e2fs.e2fs_icount) + error_msg = "inode out of bounds"; + + if (error_msg != NULL) { + printf( "bad directory entry: %s\n" + "offset=%d, inode=%lu, rec_len=%d, name_len=%d \n", + error_msg, entryoffsetinblock, + (unsigned long) fs2h32(de->e2d_ino), + reclen, namlen); + panic("ext2fs_dirbadentry"); + } + return error_msg == NULL ? 0 : 1; +} + +/* + * Write a directory entry after a call to namei, using the parameters + * that it left in nameidata. The argument ip is the inode which the new + * directory entry will refer to. Dvp is a pointer to the directory to + * be written, which was left locked by namei. Remaining parameters + * (ulr_offset, ulr_count) indicate how the space for the new + * entry is to be obtained. + */ +int +ext2fs_direnter(struct inode *ip, struct vnode *dvp, + const struct ufs_lookup_results *ulr, + struct componentname *cnp) +{ + struct ext2fs_direct *ep, *nep; + struct inode *dp; + struct buf *bp; + struct ext2fs_direct newdir; + struct iovec aiov; + struct uio auio; + u_int dsize; + int error, loc, newentrysize, spacefree; + char *dirbuf; + struct ufsmount *ump = VFSTOUFS(dvp->v_mount); + int dirblksiz = ump->um_dirblksiz; + + dp = VTOI(dvp); + + newdir.e2d_ino = h2fs32(ip->i_number); + newdir.e2d_namlen = cnp->cn_namelen; + if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0 && + (ip->i_e2fs->e2fs.e2fs_features_incompat & EXT2F_INCOMPAT_FTYPE)) { + newdir.e2d_type = inot2ext2dt(IFTODT(ip->i_e2fs_mode)); + } else { + newdir.e2d_type = 0; + } + memcpy(newdir.e2d_name, cnp->cn_nameptr, (unsigned)cnp->cn_namelen + 1); + newentrysize = EXT2FS_DIRSIZ(cnp->cn_namelen); + if (ulr->ulr_count == 0) { + /* + * If ulr_count is 0, then namei could find no + * space in the directory. Here, ulr_offset will + * be on a directory block boundary and we will write the + * new entry into a fresh block. 
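The consistency rules listed above for ext2fs_dirbadentry are straightforward range checks on the record header. The user-space version below reports the first violated rule instead of panicking; EXT2_MAXNAMLEN and the size helper are assumed values, not the kernel definitions.

#include <stdio.h>

#define ROUNDUP4(x)     (((x) + 3) & ~3u)
#define EXT2_DIRSIZ(n)  (8 + ROUNDUP4(n))
#define EXT2_MAXNAMLEN  255             /* assumed limit */

/* Return NULL if the entry looks sane, otherwise a short reason string. */
static const char *
dirent_check(unsigned reclen, unsigned namlen, unsigned ino,
    unsigned offset_in_block, unsigned dirblksiz, unsigned inode_count)
{
    if (reclen < EXT2_DIRSIZ(1))
        return "rec_len is smaller than minimal";
    if (reclen % 4 != 0)
        return "rec_len % 4 != 0";
    if (namlen > EXT2_MAXNAMLEN)
        return "namlen > EXT2_MAXNAMLEN";
    if (reclen < EXT2_DIRSIZ(namlen))
        return "reclen is too small for name_len";
    if (offset_in_block + reclen > dirblksiz)
        return "directory entry across blocks";
    if (ino > inode_count)
        return "inode out of bounds";
    return NULL;
}

int
main(void)
{
    const char *why = dirent_check(14, 4, 11, 0, 1024, 8192);

    printf("%s\n", why ? why : "ok");   /* prints "rec_len % 4 != 0" */
    return 0;
}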
+ */ + if (ulr->ulr_offset & (dirblksiz - 1)) + panic("ext2fs_direnter: newblk"); + auio.uio_offset = ulr->ulr_offset; + newdir.e2d_reclen = h2fs16(dirblksiz); + auio.uio_resid = newentrysize; + aiov.iov_len = newentrysize; + aiov.iov_base = (void *)&newdir; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_WRITE; + UIO_SETUP_SYSSPACE(&auio); + error = VOP_WRITE(dvp, &auio, IO_SYNC, cnp->cn_cred); + if (dirblksiz > dvp->v_mount->mnt_stat.f_bsize) + /* XXX should grow with balloc() */ + panic("ext2fs_direnter: frag size"); + else if (!error) { + error = ext2fs_setsize(dp, + roundup(ext2fs_size(dp), dirblksiz)); + if (error) + return (error); + dp->i_flag |= IN_CHANGE; + uvm_vnp_setsize(dvp, ext2fs_size(dp)); + } + return (error); + } + + /* + * If ulr_count is non-zero, then namei found space + * for the new entry in the range ulr_offset to + * ulr_offset + ulr_count in the directory. + * To use this space, we may have to compact the entries located + * there, by copying them together towards the beginning of the + * block, leaving the free space in one usable chunk at the end. + */ + + /* + * Get the block containing the space for the new directory entry. + */ + if ((error = ext2fs_blkatoff(dvp, (off_t)ulr->ulr_offset, &dirbuf, &bp)) != 0) + return (error); + /* + * Find space for the new entry. In the simple case, the entry at + * offset base will have the space. If it does not, then namei + * arranged that compacting the region ulr_offset to + * ulr_offset + ulr_count would yield the + * space. + */ + ep = (struct ext2fs_direct *)dirbuf; + dsize = EXT2FS_DIRSIZ(ep->e2d_namlen); + spacefree = fs2h16(ep->e2d_reclen) - dsize; + for (loc = fs2h16(ep->e2d_reclen); loc < ulr->ulr_count; ) { + nep = (struct ext2fs_direct *)(dirbuf + loc); + if (ep->e2d_ino) { + /* trim the existing slot */ + ep->e2d_reclen = h2fs16(dsize); + ep = (struct ext2fs_direct *)((char *)ep + dsize); + } else { + /* overwrite; nothing there; header is ours */ + spacefree += dsize; + } + dsize = EXT2FS_DIRSIZ(nep->e2d_namlen); + spacefree += fs2h16(nep->e2d_reclen) - dsize; + loc += fs2h16(nep->e2d_reclen); + memcpy((void *)ep, (void *)nep, dsize); + } + /* + * Update the pointer fields in the previous entry (if any), + * copy in the new entry, and write out the block. + */ + if (ep->e2d_ino == 0) { +#ifdef DIAGNOSTIC + if (spacefree + dsize < newentrysize) + panic("ext2fs_direnter: compact1"); +#endif + newdir.e2d_reclen = h2fs16(spacefree + dsize); + } else { +#ifdef DIAGNOSTIC + if (spacefree < newentrysize) { + printf("ext2fs_direnter: compact2 %u %u", + (u_int)spacefree, (u_int)newentrysize); + panic("ext2fs_direnter: compact2"); + } +#endif + newdir.e2d_reclen = h2fs16(spacefree); + ep->e2d_reclen = h2fs16(dsize); + ep = (struct ext2fs_direct *)((char *)ep + dsize); + } + memcpy((void *)ep, (void *)&newdir, (u_int)newentrysize); + error = VOP_BWRITE(bp->b_vp, bp); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + if (!error && ulr->ulr_endoff && ulr->ulr_endoff < ext2fs_size(dp)) + error = ext2fs_truncate(dvp, (off_t)ulr->ulr_endoff, IO_SYNC, + cnp->cn_cred); + return (error); +} + +/* + * Remove a directory entry after a call to namei, using + * the auxiliary results it provided. The entry + * ulr_offset contains the offset into the directory of the + * entry to be eliminated. The ulr_count field contains the + * size of the previous record in the directory. If this + * is 0, the first entry is being deleted, so we need only + * zero the inode number to mark the entry as free. 
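When ulr_count is non-zero, direnter packs the live records in the region down to their real size and gives the recovered bytes to the new entry. The sketch below shows only that size bookkeeping; the in-place copying of records that the kernel loop performs is elided, and the size helper is again an illustrative stand-in.

#include <stdio.h>

#define ROUNDUP4(x)     (((x) + 3) & ~3u)
#define EXT2_DIRSIZ(n)  (8 + ROUNDUP4(n))

struct rec { unsigned ino, reclen, namlen; };

int
main(void)
{
    /*
     * Region found by lookup: one live entry with lots of slack followed
     * by one deleted entry.  Compaction trims the live entry to its real
     * size and hands all remaining bytes to the new record.
     */
    struct rec region[] = { { 12, 32, 3 }, { 0, 16, 6 } };
    unsigned count = 48;                /* ulr_count: bytes in the region */
    unsigned used = 0, spacefree = 0;
    size_t i;

    for (i = 0; i < sizeof(region) / sizeof(region[0]); i++) {
        unsigned dsize = EXT2_DIRSIZ(region[i].namlen);

        if (region[i].ino != 0) {
            spacefree += region[i].reclen - dsize;
            region[i].reclen = dsize;   /* trim the kept entry */
            used += dsize;
        } else {
            spacefree += region[i].reclen;      /* whole record is free */
        }
    }
    printf("kept entries use %u bytes, new entry gets reclen %u of %u\n",
        used, spacefree, count);
    return 0;
}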
If the + * entry is not the first in the directory, we must reclaim + * the space of the now empty record by adding the record size + * to the size of the previous entry. + */ +int +ext2fs_dirremove(struct vnode *dvp, const struct ufs_lookup_results *ulr, + struct componentname *cnp) +{ + struct inode *dp; + struct ext2fs_direct *ep; + struct buf *bp; + int error; + + dp = VTOI(dvp); + + if (ulr->ulr_count == 0) { + /* + * First entry in block: set d_ino to zero. + */ + error = ext2fs_blkatoff(dvp, (off_t)ulr->ulr_offset, + (void *)&ep, &bp); + if (error != 0) + return (error); + ep->e2d_ino = 0; + error = VOP_BWRITE(bp->b_vp, bp); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + return (error); + } + /* + * Collapse new free space into previous entry. + */ + error = ext2fs_blkatoff(dvp, (off_t)(ulr->ulr_offset - ulr->ulr_count), + (void *)&ep, &bp); + if (error != 0) + return (error); + ep->e2d_reclen = h2fs16(fs2h16(ep->e2d_reclen) + ulr->ulr_reclen); + error = VOP_BWRITE(bp->b_vp, bp); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + return (error); +} + +/* + * Rewrite an existing directory entry to point at the inode + * supplied. The parameters describing the directory entry are + * set up by a call to namei. + */ +int +ext2fs_dirrewrite(struct inode *dp, const struct ufs_lookup_results *ulr, + struct inode *ip, struct componentname *cnp) +{ + struct buf *bp; + struct ext2fs_direct *ep; + struct vnode *vdp = ITOV(dp); + int error; + + error = ext2fs_blkatoff(vdp, (off_t)ulr->ulr_offset, (void *)&ep, &bp); + if (error != 0) + return (error); + ep->e2d_ino = h2fs32(ip->i_number); + if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0 && + (ip->i_e2fs->e2fs.e2fs_features_incompat & EXT2F_INCOMPAT_FTYPE)) { + ep->e2d_type = inot2ext2dt(IFTODT(ip->i_e2fs_mode)); + } else { + ep->e2d_type = 0; + } + error = VOP_BWRITE(bp->b_vp, bp); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + return (error); +} + +/* + * Check if a directory is empty or not. + * Inode supplied must be locked. + * + * Using a struct dirtemplate here is not precisely + * what we want, but better than using a struct ext2fs_direct. + * + * NB: does not handle corrupted directories. + */ +int +ext2fs_dirempty(struct inode *ip, ino_t parentino, kauth_cred_t cred) +{ + off_t off; + struct ext2fs_dirtemplate dbuf; + struct ext2fs_direct *dp = (struct ext2fs_direct *)&dbuf; + int error, namlen; + size_t count; + +#define MINDIRSIZ (sizeof (struct ext2fs_dirtemplate) / 2) + + for (off = 0; off < ext2fs_size(ip); off += fs2h16(dp->e2d_reclen)) { + error = vn_rdwr(UIO_READ, ITOV(ip), (void *)dp, MINDIRSIZ, off, + UIO_SYSSPACE, IO_NODELOCKED, cred, &count, NULL); + /* + * Since we read MINDIRSIZ, residual must + * be 0 unless we're at end of file. + */ + if (error || count != 0) + return (0); + /* avoid infinite loops */ + if (dp->e2d_reclen == 0) + return (0); + /* skip empty entries */ + if (dp->e2d_ino == 0) + continue; + /* accept only "." and ".." */ + namlen = dp->e2d_namlen; + if (namlen > 2) + return (0); + if (dp->e2d_name[0] != '.') + return (0); + /* + * At this point namlen must be 1 or 2. + * 1 implies ".", 2 implies ".." if second + * char is also "." + */ + if (namlen == 1) + continue; + if (dp->e2d_name[1] == '.' && fs2h32(dp->e2d_ino) == parentino) + continue; + return (0); + } + return (1); +} + +/* + * Check if source directory is in the path of the target directory. + * Target is supplied locked, source is unlocked. + * The target is always vput before returning. 
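ext2fs_dirremove above has exactly two cases: the first record of a block merely gets its inode number cleared, while any other record is swallowed by its predecessor, whose reclen grows by the removed record's reclen. A toy illustration:

#include <stdio.h>
#include <stddef.h>

struct rec { unsigned ino, reclen; };

/*
 * Remove "de".  If it is the first record of its block there is no
 * predecessor to grow, so only the inode number is cleared; otherwise the
 * predecessor swallows the removed record's space.
 */
static void
remove_entry(struct rec *prev, struct rec *de)
{
    if (prev == NULL)
        de->ino = 0;
    else
        prev->reclen += de->reclen;
}

int
main(void)
{
    struct rec prev = { 12, 16 };
    struct rec gone = { 37, 24 };

    remove_entry(&prev, &gone);
    printf("prev reclen now %u\n", prev.reclen);        /* 16 + 24 = 40 */
    return 0;
}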
+ */ +int +ext2fs_checkpath(struct inode *source, struct inode *target, + kauth_cred_t cred) +{ + struct vnode *vp; + int error, rootino, namlen; + struct ext2fs_dirtemplate dirbuf; + uint32_t ino; + + vp = ITOV(target); + if (target->i_number == source->i_number) { + error = EEXIST; + goto out; + } + rootino = ROOTINO; + error = 0; + if (target->i_number == rootino) + goto out; + + for (;;) { + if (vp->v_type != VDIR) { + error = ENOTDIR; + break; + } + error = vn_rdwr(UIO_READ, vp, (void *)&dirbuf, + sizeof (struct ext2fs_dirtemplate), (off_t)0, + UIO_SYSSPACE, IO_NODELOCKED, cred, (size_t *)0, + NULL); + if (error != 0) + break; + namlen = dirbuf.dotdot_namlen; + if (namlen != 2 || + dirbuf.dotdot_name[0] != '.' || + dirbuf.dotdot_name[1] != '.') { + error = ENOTDIR; + break; + } + ino = fs2h32(dirbuf.dotdot_ino); + if (ino == source->i_number) { + error = EINVAL; + break; + } + if (ino == rootino) + break; + vput(vp); + error = VFS_VGET(vp->v_mount, ino, &vp); + if (error != 0) { + vp = NULL; + break; + } + } + +out: + if (error == ENOTDIR) { + printf("checkpath: .. not a directory\n"); + panic("checkpath"); + } + if (vp != NULL) + vput(vp); + return (error); +} diff --git a/sys/ufs/ext2fs/ext2fs_readwrite.c b/sys/ufs/ext2fs/ext2fs_readwrite.c new file mode 100644 index 000000000..0b6f8d617 --- /dev/null +++ b/sys/ufs/ext2fs/ext2fs_readwrite.c @@ -0,0 +1,392 @@ +/* $NetBSD: ext2fs_readwrite.c,v 1.58 2011/11/18 21:18:51 christos Exp $ */ + +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_readwrite.c 8.8 (Berkeley) 8/4/94 + * Modified for ext2fs by Manuel Bouyer. + */ + +/*- + * Copyright (c) 1997 Manuel Bouyer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @(#)ufs_readwrite.c 8.8 (Berkeley) 8/4/94 + * Modified for ext2fs by Manuel Bouyer. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ext2fs_readwrite.c,v 1.58 2011/11/18 21:18:51 christos Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + + +#define doclusterread 0 /* XXX underway */ +#define doclusterwrite 0 + +/* + * Vnode op for reading. + */ +/* ARGSUSED */ +int +ext2fs_read(void *v) +{ + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp; + struct inode *ip; + struct uio *uio; + struct m_ext2fs *fs; + struct buf *bp; + struct ufsmount *ump; + vsize_t bytelen; + daddr_t lbn, nextlbn; + off_t bytesinfile; + long size, xfersize, blkoffset; + int error; + + vp = ap->a_vp; + ip = VTOI(vp); + ump = ip->i_ump; + uio = ap->a_uio; + error = 0; + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_READ) + panic("%s: mode", "ext2fs_read"); + + if (vp->v_type == VLNK) { + if (ext2fs_size(ip) < ump->um_maxsymlinklen || + (ump->um_maxsymlinklen == 0 && ip->i_e2fs_nblock == 0)) + panic("%s: short symlink", "ext2fs_read"); + } else if (vp->v_type != VREG && vp->v_type != VDIR) + panic("%s: type %d", "ext2fs_read", vp->v_type); +#endif + fs = ip->i_e2fs; + if ((uint64_t)uio->uio_offset > ump->um_maxfilesize) + return (EFBIG); + if (uio->uio_resid == 0) + return (0); + if (uio->uio_offset >= ext2fs_size(ip)) + goto out; + + if (vp->v_type == VREG) { + const int advice = IO_ADV_DECODE(ap->a_ioflag); + + while (uio->uio_resid > 0) { + bytelen = MIN(ext2fs_size(ip) - uio->uio_offset, + uio->uio_resid); + if (bytelen == 0) + break; + + error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice, + UBC_READ | UBC_PARTIALOK | UBC_UNMAP_FLAG(vp)); + if (error) + break; + } + goto out; + } + + for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { + bytesinfile = ext2fs_size(ip) - uio->uio_offset; + if (bytesinfile <= 0) + break; + lbn = lblkno(fs, uio->uio_offset); + nextlbn = lbn + 1; + size = fs->e2fs_bsize; + blkoffset = blkoff(fs, uio->uio_offset); + xfersize = fs->e2fs_bsize - blkoffset; + if (uio->uio_resid < xfersize) + xfersize = uio->uio_resid; + if (bytesinfile < xfersize) + xfersize = bytesinfile; + + if (lblktosize(fs, nextlbn) >= ext2fs_size(ip)) + error = bread(vp, lbn, size, NOCRED, 0, &bp); + else { + int nextsize = fs->e2fs_bsize; + error = breadn(vp, lbn, + size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp); + } + if (error) + 
break; + + /* + * We should only get non-zero b_resid when an I/O error + * has occurred, which should cause us to break above. + * However, if the short read did not cause an error, + * then we want to ensure that we do not uiomove bad + * or uninitialized data. + */ + size -= bp->b_resid; + if (size < xfersize) { + if (size == 0) + break; + xfersize = size; + } + error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); + if (error) + break; + brelse(bp, 0); + } + if (bp != NULL) + brelse(bp, 0); + +out: + if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) { + ip->i_flag |= IN_ACCESS; + if ((ap->a_ioflag & IO_SYNC) == IO_SYNC) + error = ext2fs_update(vp, NULL, NULL, UPDATE_WAIT); + } + return (error); +} + +/* + * Vnode op for writing. + */ +int +ext2fs_write(void *v) +{ + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp; + struct uio *uio; + struct inode *ip; + struct m_ext2fs *fs; + struct buf *bp; + struct ufsmount *ump; + daddr_t lbn; + off_t osize; + int blkoffset, error, flags, ioflag, resid, xfersize; + vsize_t bytelen; + off_t oldoff = 0; /* XXX */ + bool async; + int extended = 0; + int advice; + + ioflag = ap->a_ioflag; + advice = IO_ADV_DECODE(ioflag); + uio = ap->a_uio; + vp = ap->a_vp; + ip = VTOI(vp); + ump = ip->i_ump; + error = 0; + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_WRITE) + panic("%s: mode", "ext2fs_write"); +#endif + + switch (vp->v_type) { + case VREG: + if (ioflag & IO_APPEND) + uio->uio_offset = ext2fs_size(ip); + if ((ip->i_e2fs_flags & EXT2_APPEND) && + uio->uio_offset != ext2fs_size(ip)) + return (EPERM); + /* FALLTHROUGH */ + case VLNK: + break; + case VDIR: + if ((ioflag & IO_SYNC) == 0) + panic("%s: nonsync dir write", "ext2fs_write"); + break; + default: + panic("%s: type", "ext2fs_write"); + } + + fs = ip->i_e2fs; + if (uio->uio_offset < 0 || + (uint64_t)uio->uio_offset + uio->uio_resid > ump->um_maxfilesize) + return (EFBIG); + if (uio->uio_resid == 0) + return (0); + + async = vp->v_mount->mnt_flag & MNT_ASYNC; + resid = uio->uio_resid; + osize = ext2fs_size(ip); + + if (vp->v_type == VREG) { + while (uio->uio_resid > 0) { + oldoff = uio->uio_offset; + blkoffset = blkoff(fs, uio->uio_offset); + bytelen = MIN(fs->e2fs_bsize - blkoffset, + uio->uio_resid); + + if (vp->v_size < oldoff + bytelen) { + uvm_vnp_setwritesize(vp, oldoff + bytelen); + } + error = ufs_balloc_range(vp, uio->uio_offset, + bytelen, ap->a_cred, 0); + if (error) + break; + error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice, + UBC_WRITE | UBC_UNMAP_FLAG(vp)); + if (error) + break; + + /* + * update UVM's notion of the size now that we've + * copied the data into the vnode's pages. + */ + + if (vp->v_size < uio->uio_offset) { + uvm_vnp_setsize(vp, uio->uio_offset); + extended = 1; + } + + /* + * flush what we just wrote if necessary. + * XXXUBC simplistic async flushing. + */ + + if (!async && oldoff >> 16 != uio->uio_offset >> 16) { + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, (oldoff >> 16) << 16, + (uio->uio_offset >> 16) << 16, PGO_CLEANIT); + } + } + if (error == 0 && ioflag & IO_SYNC) { + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, trunc_page(oldoff), + round_page(blkroundup(fs, uio->uio_offset)), + PGO_CLEANIT | PGO_SYNCIO); + } + + goto out; + } + + flags = ioflag & IO_SYNC ? 
B_SYNC : 0; + for (error = 0; uio->uio_resid > 0;) { + lbn = lblkno(fs, uio->uio_offset); + blkoffset = blkoff(fs, uio->uio_offset); + xfersize = MIN(fs->e2fs_bsize - blkoffset, uio->uio_resid); + if (xfersize < fs->e2fs_bsize) + flags |= B_CLRBUF; + else + flags &= ~B_CLRBUF; + error = ext2fs_balloc(ip, + lbn, blkoffset + xfersize, ap->a_cred, &bp, flags); + if (error) + break; + if (ext2fs_size(ip) < uio->uio_offset + xfersize) { + error = ext2fs_setsize(ip, uio->uio_offset + xfersize); + if (error) + break; + } + error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); + + /* + * update UVM's notion of the size now that we've + * copied the data into the vnode's pages. + */ + + if (vp->v_size < uio->uio_offset) { + uvm_vnp_setsize(vp, uio->uio_offset); + extended = 1; + } + + if (ioflag & IO_SYNC) + (void)bwrite(bp); + else if (xfersize + blkoffset == fs->e2fs_bsize) + bawrite(bp); + else + bdwrite(bp); + if (error || xfersize == 0) + break; + } + + /* + * If we successfully wrote any data, and we are not the superuser + * we clear the setuid and setgid bits as a precaution against + * tampering. + */ + +out: + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (vp->v_mount->mnt_flag & MNT_RELATIME) + ip->i_flag |= IN_ACCESS; + if (resid > uio->uio_resid && ap->a_cred && + kauth_authorize_generic(ap->a_cred, KAUTH_GENERIC_ISSUSER, NULL)) + ip->i_e2fs_mode &= ~(ISUID | ISGID); + if (resid > uio->uio_resid) + VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0)); + if (error) { + (void) ext2fs_truncate(vp, osize, ioflag & IO_SYNC, ap->a_cred); + uio->uio_offset -= resid - uio->uio_resid; + uio->uio_resid = resid; + } else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC) + error = ext2fs_update(vp, NULL, NULL, UPDATE_WAIT); + KASSERT(vp->v_size == ext2fs_size(ip)); + return (error); +} diff --git a/sys/ufs/ext2fs/ext2fs_subr.c b/sys/ufs/ext2fs/ext2fs_subr.c new file mode 100644 index 000000000..64f4c9f2c --- /dev/null +++ b/sys/ufs/ext2fs/ext2fs_subr.c @@ -0,0 +1,137 @@ +/* $NetBSD: ext2fs_subr.c,v 1.27 2009/10/19 18:41:17 bouyer Exp $ */ + +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_subr.c 8.2 (Berkeley) 9/21/93 + * Modified for ext2fs by Manuel Bouyer. + */ + +/* + * Copyright (c) 1997 Manuel Bouyer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @(#)ffs_subr.c 8.2 (Berkeley) 9/21/93 + * Modified for ext2fs by Manuel Bouyer. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ext2fs_subr.c,v 1.27 2009/10/19 18:41:17 bouyer Exp $"); + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * Return buffer with the contents of block "offset" from the beginning of + * directory "ip". If "res" is non-zero, fill it in with a pointer to the + * remaining space in the directory. 
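ext2fs_blkatoff, defined further below, only has to split a byte offset into a logical block number and an offset within that block; with a power-of-two block size the lblkno()/blkoff() macros reduce to a shift and a mask. A sketch assuming a 4096-byte block:

#include <stdio.h>

#define BSIZE   4096
#define BSHIFT  12                      /* log2(BSIZE) */
#define BMASK   (BSIZE - 1)

int
main(void)
{
    long long offset = 1234567;         /* byte offset into the directory */
    long long lbn = offset >> BSHIFT;   /* logical block holding it */
    long long off = offset & BMASK;     /* position inside that block */

    printf("offset %lld -> block %lld, byte %lld\n", offset, lbn, off);
    return 0;
}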
+ */ +int +ext2fs_blkatoff(struct vnode *vp, off_t offset, char **res, struct buf **bpp) +{ + struct inode *ip; + struct m_ext2fs *fs; + struct buf *bp; + daddr_t lbn; + int error; + + ip = VTOI(vp); + fs = ip->i_e2fs; + lbn = lblkno(fs, offset); + + *bpp = NULL; + if ((error = bread(vp, lbn, fs->e2fs_bsize, NOCRED, 0, &bp)) != 0) { + brelse(bp, 0); + return (error); + } + if (res) + *res = (char *)bp->b_data + blkoff(fs, offset); + *bpp = bp; + return (0); +} + +void +ext2fs_itimes(struct inode *ip, const struct timespec *acc, + const struct timespec *mod, const struct timespec *cre) +{ + struct timespec now; + + if (!(ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY))) { + return; + } + + vfs_timestamp(&now); + if (ip->i_flag & IN_ACCESS) { + if (acc == NULL) + acc = &now; + ip->i_e2fs_atime = acc->tv_sec; + } + if (ip->i_flag & (IN_UPDATE | IN_MODIFY)) { + if (mod == NULL) + mod = &now; + ip->i_e2fs_mtime = mod->tv_sec; + ip->i_modrev++; + } + if (ip->i_flag & (IN_CHANGE | IN_MODIFY)) { + if (cre == NULL) + cre = &now; + ip->i_e2fs_ctime = cre->tv_sec; + } + if (ip->i_flag & (IN_ACCESS | IN_MODIFY)) + ip->i_flag |= IN_ACCESSED; + if (ip->i_flag & (IN_UPDATE | IN_CHANGE)) + ip->i_flag |= IN_MODIFIED; + ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY); +} diff --git a/sys/ufs/ext2fs/ext2fs_vfsops.c b/sys/ufs/ext2fs/ext2fs_vfsops.c new file mode 100644 index 000000000..76f6dd5a5 --- /dev/null +++ b/sys/ufs/ext2fs/ext2fs_vfsops.c @@ -0,0 +1,1266 @@ +/* $NetBSD: ext2fs_vfsops.c,v 1.162 2011/11/14 18:35:14 hannken Exp $ */ + +/* + * Copyright (c) 1989, 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_vfsops.c 8.14 (Berkeley) 11/28/94 + * Modified for ext2fs by Manuel Bouyer. + */ + +/* + * Copyright (c) 1997 Manuel Bouyer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @(#)ffs_vfsops.c 8.14 (Berkeley) 11/28/94 + * Modified for ext2fs by Manuel Bouyer. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ext2fs_vfsops.c,v 1.162 2011/11/14 18:35:14 hannken Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_compat_netbsd.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +MODULE(MODULE_CLASS_VFS, ext2fs, "ffs"); + +int ext2fs_sbupdate(struct ufsmount *, int); +static int ext2fs_checksb(struct ext2fs *, int); + +static struct sysctllog *ext2fs_sysctl_log; + +extern const struct vnodeopv_desc ext2fs_vnodeop_opv_desc; +extern const struct vnodeopv_desc ext2fs_specop_opv_desc; +extern const struct vnodeopv_desc ext2fs_fifoop_opv_desc; + +const struct vnodeopv_desc * const ext2fs_vnodeopv_descs[] = { + &ext2fs_vnodeop_opv_desc, + &ext2fs_specop_opv_desc, + &ext2fs_fifoop_opv_desc, + NULL, +}; + +struct vfsops ext2fs_vfsops = { + MOUNT_EXT2FS, + sizeof (struct ufs_args), + ext2fs_mount, + ufs_start, + ext2fs_unmount, + ufs_root, + ufs_quotactl, + ext2fs_statvfs, + ext2fs_sync, + ext2fs_vget, + ext2fs_fhtovp, + ext2fs_vptofh, + ext2fs_init, + ext2fs_reinit, + ext2fs_done, + ext2fs_mountroot, + (int (*)(struct mount *, struct vnode *, struct timespec *)) eopnotsupp, + vfs_stdextattrctl, + (void *)eopnotsupp, /* vfs_suspendctl */ + genfs_renamelock_enter, + genfs_renamelock_exit, + (void *)eopnotsupp, + ext2fs_vnodeopv_descs, + 0, + { NULL, NULL }, +}; + +static const struct genfs_ops ext2fs_genfsops = { + .gop_size = genfs_size, + .gop_alloc = ext2fs_gop_alloc, + .gop_write = genfs_gop_write, + .gop_markupdate = ufs_gop_markupdate, +}; + +static const struct ufs_ops ext2fs_ufsops = { + .uo_itimes = ext2fs_itimes, + .uo_update = ext2fs_update, + .uo_vfree = ext2fs_vfree, + .uo_unmark_vnode = (void (*)(vnode_t *))nullop, +}; + +/* Fill in the inode uid/gid from ext2 halves. 
*/ +void +ext2fs_set_inode_guid(struct inode *ip) +{ + + ip->i_gid = ip->i_e2fs_gid; + ip->i_uid = ip->i_e2fs_uid; + if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0) { + ip->i_gid |= ip->i_e2fs_gid_high << 16; + ip->i_uid |= ip->i_e2fs_uid_high << 16; + } +} + +static int +ext2fs_modcmd(modcmd_t cmd, void *arg) +{ + int error; + + switch (cmd) { + case MODULE_CMD_INIT: + error = vfs_attach(&ext2fs_vfsops); + if (error != 0) + break; + sysctl_createv(&ext2fs_sysctl_log, 0, NULL, NULL, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "vfs", NULL, + NULL, 0, NULL, 0, + CTL_VFS, CTL_EOL); + sysctl_createv(&ext2fs_sysctl_log, 0, NULL, NULL, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "ext2fs", + SYSCTL_DESCR("Linux EXT2FS file system"), + NULL, 0, NULL, 0, + CTL_VFS, 17, CTL_EOL); + /* + * XXX the "17" above could be dynamic, thereby eliminating + * one more instance of the "number to vfs" mapping problem, + * but "17" is the order as taken from sys/mount.h + */ + break; + case MODULE_CMD_FINI: + error = vfs_detach(&ext2fs_vfsops); + if (error != 0) + break; + sysctl_teardown(&ext2fs_sysctl_log); + break; + default: + error = ENOTTY; + break; + } + + return (error); +} + +/* + * XXX Same structure as FFS inodes? Should we share a common pool? + */ +struct pool ext2fs_inode_pool; +struct pool ext2fs_dinode_pool; + +extern u_long ext2gennumber; + +void +ext2fs_init(void) +{ + + pool_init(&ext2fs_inode_pool, sizeof(struct inode), 0, 0, 0, + "ext2fsinopl", &pool_allocator_nointr, IPL_NONE); + pool_init(&ext2fs_dinode_pool, sizeof(struct ext2fs_dinode), 0, 0, 0, + "ext2dinopl", &pool_allocator_nointr, IPL_NONE); + ufs_init(); +} + +void +ext2fs_reinit(void) +{ + ufs_reinit(); +} + +void +ext2fs_done(void) +{ + + ufs_done(); + pool_destroy(&ext2fs_inode_pool); + pool_destroy(&ext2fs_dinode_pool); +} + +/* + * Called by main() when ext2fs is going to be mounted as root. + * + * Name is updated by mount(8) after booting. + */ +#define ROOTNAME "root_device" + +int +ext2fs_mountroot(void) +{ + extern struct vnode *rootvp; + struct m_ext2fs *fs; + struct mount *mp; + struct ufsmount *ump; + int error; + + if (device_class(root_device) != DV_DISK) + return (ENODEV); + + if ((error = vfs_rootmountalloc(MOUNT_EXT2FS, "root_device", &mp))) { + vrele(rootvp); + return (error); + } + + if ((error = ext2fs_mountfs(rootvp, mp)) != 0) { + vfs_unbusy(mp, false, NULL); + vfs_destroy(mp); + return (error); + } + mutex_enter(&mountlist_lock); + CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mutex_exit(&mountlist_lock); + ump = VFSTOUFS(mp); + fs = ump->um_e2fs; + memset(fs->e2fs_fsmnt, 0, sizeof(fs->e2fs_fsmnt)); + (void) copystr(mp->mnt_stat.f_mntonname, fs->e2fs_fsmnt, + sizeof(fs->e2fs_fsmnt) - 1, 0); + if (fs->e2fs.e2fs_rev > E2FS_REV0) { + memset(fs->e2fs.e2fs_fsmnt, 0, sizeof(fs->e2fs.e2fs_fsmnt)); + (void) copystr(mp->mnt_stat.f_mntonname, fs->e2fs.e2fs_fsmnt, + sizeof(fs->e2fs.e2fs_fsmnt) - 1, 0); + } + (void)ext2fs_statvfs(mp, &mp->mnt_stat); + vfs_unbusy(mp, false, NULL); + setrootfstime((time_t)fs->e2fs.e2fs_wtime); + return (0); +} + +/* + * VFS Operations. 
+ * + * mount system call + */ +int +ext2fs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) +{ + struct lwp *l = curlwp; + struct vnode *devvp; + struct ufs_args *args = data; + struct ufsmount *ump = NULL; + struct m_ext2fs *fs; + size_t size; + int error = 0, flags, update; + mode_t accessmode; + + if (*data_len < sizeof *args) + return EINVAL; + + if (mp->mnt_flag & MNT_GETARGS) { + ump = VFSTOUFS(mp); + if (ump == NULL) + return EIO; + memset(args, 0, sizeof *args); + args->fspec = NULL; + *data_len = sizeof *args; + return 0; + } + + update = mp->mnt_flag & MNT_UPDATE; + + /* Check arguments */ + if (args->fspec != NULL) { + /* + * Look up the name and verify that it's sane. + */ + error = namei_simple_user(args->fspec, + NSM_FOLLOW_NOEMULROOT, &devvp); + if (error != 0) + return (error); + + if (!update) { + /* + * Be sure this is a valid block device + */ + if (devvp->v_type != VBLK) + error = ENOTBLK; + else if (bdevsw_lookup(devvp->v_rdev) == NULL) + error = ENXIO; + } else { + /* + * Be sure we're still naming the same device + * used for our initial mount + */ + ump = VFSTOUFS(mp); + if (devvp != ump->um_devvp) { + if (devvp->v_rdev != ump->um_devvp->v_rdev) + error = EINVAL; + else { + vrele(devvp); + devvp = ump->um_devvp; + vref(devvp); + } + } + } + } else { + if (!update) { + /* New mounts must have a filename for the device */ + return (EINVAL); + } else { + ump = VFSTOUFS(mp); + devvp = ump->um_devvp; + vref(devvp); + } + } + + /* + * If mount by non-root, then verify that user has necessary + * permissions on the device. + * + * Permission to update a mount is checked higher, so here we presume + * updating the mount is okay (for example, as far as securelevel goes) + * which leaves us with the normal check. + */ + if (error == 0) { + accessmode = VREAD; + if (update ? + (mp->mnt_iflag & IMNT_WANTRDWR) != 0 : + (mp->mnt_flag & MNT_RDONLY) == 0) + accessmode |= VWRITE; + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = genfs_can_mount(devvp, accessmode, l->l_cred); + VOP_UNLOCK(devvp); + } + + if (error) { + vrele(devvp); + return (error); + } + + if (!update) { + int xflags; + + if (mp->mnt_flag & MNT_RDONLY) + xflags = FREAD; + else + xflags = FREAD|FWRITE; + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_OPEN(devvp, xflags, FSCRED); + VOP_UNLOCK(devvp); + if (error) + goto fail; + error = ext2fs_mountfs(devvp, mp); + if (error) { + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + (void)VOP_CLOSE(devvp, xflags, NOCRED); + VOP_UNLOCK(devvp); + goto fail; + } + + ump = VFSTOUFS(mp); + fs = ump->um_e2fs; + } else { + /* + * Update the mount. + */ + + /* + * The initial mount got a reference on this + * device, so drop the one obtained via + * namei(), above. 
+ */ + vrele(devvp); + + ump = VFSTOUFS(mp); + fs = ump->um_e2fs; + if (fs->e2fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) { + /* + * Changing from r/w to r/o + */ + flags = WRITECLOSE; + if (mp->mnt_flag & MNT_FORCE) + flags |= FORCECLOSE; + error = ext2fs_flushfiles(mp, flags); + if (error == 0 && + ext2fs_cgupdate(ump, MNT_WAIT) == 0 && + (fs->e2fs.e2fs_state & E2FS_ERRORS) == 0) { + fs->e2fs.e2fs_state = E2FS_ISCLEAN; + (void) ext2fs_sbupdate(ump, MNT_WAIT); + } + if (error) + return (error); + fs->e2fs_ronly = 1; + } + + if (mp->mnt_flag & MNT_RELOAD) { + error = ext2fs_reload(mp, l->l_cred, l); + if (error) + return (error); + } + + if (fs->e2fs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR)) { + /* + * Changing from read-only to read/write + */ + fs->e2fs_ronly = 0; + if (fs->e2fs.e2fs_state == E2FS_ISCLEAN) + fs->e2fs.e2fs_state = 0; + else + fs->e2fs.e2fs_state = E2FS_ERRORS; + fs->e2fs_fmod = 1; + } + if (args->fspec == NULL) + return 0; + } + + error = set_statvfs_info(path, UIO_USERSPACE, args->fspec, + UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l); + (void) copystr(mp->mnt_stat.f_mntonname, fs->e2fs_fsmnt, + sizeof(fs->e2fs_fsmnt) - 1, &size); + memset(fs->e2fs_fsmnt + size, 0, sizeof(fs->e2fs_fsmnt) - size); + if (fs->e2fs.e2fs_rev > E2FS_REV0) { + (void) copystr(mp->mnt_stat.f_mntonname, fs->e2fs.e2fs_fsmnt, + sizeof(fs->e2fs.e2fs_fsmnt) - 1, &size); + memset(fs->e2fs.e2fs_fsmnt, 0, + sizeof(fs->e2fs.e2fs_fsmnt) - size); + } + if (fs->e2fs_fmod != 0) { /* XXX */ + fs->e2fs_fmod = 0; + if (fs->e2fs.e2fs_state == 0) + fs->e2fs.e2fs_wtime = time_second; + else + printf("%s: file system not clean; please fsck(8)\n", + mp->mnt_stat.f_mntfromname); + (void) ext2fs_cgupdate(ump, MNT_WAIT); + } + return (error); + +fail: + vrele(devvp); + return (error); +} + +/* + * Reload all incore data for a filesystem (used after running fsck on + * the root filesystem and finding things to fix). The filesystem must + * be mounted read-only. + * + * Things to do to update the mount: + * 1) invalidate all cached meta-data. + * 2) re-read superblock from disk. + * 3) re-read summary information from disk. + * 4) invalidate all inactive vnodes. + * 5) invalidate all cached file data. + * 6) re-read inode data for all active vnodes. + */ +int +ext2fs_reload(struct mount *mp, kauth_cred_t cred, struct lwp *l) +{ + struct vnode *vp, *mvp, *devvp; + struct inode *ip; + struct buf *bp; + struct m_ext2fs *fs; + struct ext2fs *newfs; + int i, error; + void *cp; + struct ufsmount *ump; + + if ((mp->mnt_flag & MNT_RDONLY) == 0) + return (EINVAL); + + ump = VFSTOUFS(mp); + /* + * Step 1: invalidate all cached meta-data. + */ + devvp = ump->um_devvp; + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = vinvalbuf(devvp, 0, cred, l, 0, 0); + VOP_UNLOCK(devvp); + if (error) + panic("ext2fs_reload: dirty1"); + /* + * Step 2: re-read superblock from disk. 
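+ * The superblock is re-read from its fixed location (SBLOCK) and validated
+ * with ext2fs_checksb() before the in-core copy is overwritten.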
+ */ + error = bread(devvp, SBLOCK, SBSIZE, NOCRED, 0, &bp); + if (error) { + brelse(bp, 0); + return (error); + } + newfs = (struct ext2fs *)bp->b_data; + error = ext2fs_checksb(newfs, (mp->mnt_flag & MNT_RDONLY) != 0); + if (error) { + brelse(bp, 0); + return (error); + } + + fs = ump->um_e2fs; + /* + * copy in new superblock, and compute in-memory values + */ + e2fs_sbload(newfs, &fs->e2fs); + fs->e2fs_ncg = + howmany(fs->e2fs.e2fs_bcount - fs->e2fs.e2fs_first_dblock, + fs->e2fs.e2fs_bpg); + fs->e2fs_fsbtodb = fs->e2fs.e2fs_log_bsize + LOG_MINBSIZE - DEV_BSHIFT; + fs->e2fs_bsize = MINBSIZE << fs->e2fs.e2fs_log_bsize; + fs->e2fs_bshift = LOG_MINBSIZE + fs->e2fs.e2fs_log_bsize; + fs->e2fs_qbmask = fs->e2fs_bsize - 1; + fs->e2fs_bmask = ~fs->e2fs_qbmask; + fs->e2fs_ngdb = + howmany(fs->e2fs_ncg, fs->e2fs_bsize / sizeof(struct ext2_gd)); + fs->e2fs_ipb = fs->e2fs_bsize / EXT2_DINODE_SIZE(fs); + fs->e2fs_itpg = fs->e2fs.e2fs_ipg / fs->e2fs_ipb; + brelse(bp, 0); + + /* + * Step 3: re-read summary information from disk. + */ + + for (i = 0; i < fs->e2fs_ngdb; i++) { + error = bread(devvp , + fsbtodb(fs, fs->e2fs.e2fs_first_dblock + + 1 /* superblock */ + i), + fs->e2fs_bsize, NOCRED, 0, &bp); + if (error) { + brelse(bp, 0); + return (error); + } + e2fs_cgload((struct ext2_gd *)bp->b_data, + &fs->e2fs_gd[i * fs->e2fs_bsize / sizeof(struct ext2_gd)], + fs->e2fs_bsize); + brelse(bp, 0); + } + + /* Allocate a marker vnode. */ + mvp = vnalloc(mp); + /* + * NOTE: not using the TAILQ_FOREACH here since in this loop vgone() + * and vclean() can be called indirectly + */ + mutex_enter(&mntvnode_lock); +loop: + for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) { + vmark(mvp, vp); + if (vp->v_mount != mp || vismarker(vp)) + continue; + /* + * Step 4: invalidate all inactive vnodes. + */ + if (vrecycle(vp, &mntvnode_lock, l)) { + mutex_enter(&mntvnode_lock); + (void)vunmark(mvp); + goto loop; + } + /* + * Step 5: invalidate all cached file data. + */ + mutex_enter(vp->v_interlock); + mutex_exit(&mntvnode_lock); + if (vget(vp, LK_EXCLUSIVE)) { + mutex_enter(&mntvnode_lock); + (void)vunmark(mvp); + goto loop; + } + if (vinvalbuf(vp, 0, cred, l, 0, 0)) + panic("ext2fs_reload: dirty2"); + /* + * Step 6: re-read inode data for all active vnodes. + */ + ip = VTOI(vp); + error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), + (int)fs->e2fs_bsize, NOCRED, 0, &bp); + if (error) { + vput(vp); + mutex_enter(&mntvnode_lock); + (void)vunmark(mvp); + break; + } + cp = (char *)bp->b_data + + (ino_to_fsbo(fs, ip->i_number) * EXT2_DINODE_SIZE(fs)); + e2fs_iload((struct ext2fs_dinode *)cp, ip->i_din.e2fs_din); + ext2fs_set_inode_guid(ip); + brelse(bp, 0); + vput(vp); + mutex_enter(&mntvnode_lock); + } + mutex_exit(&mntvnode_lock); + vnfree(mvp); + return (error); +} + +/* + * Common code for mount and mountroot + */ +int +ext2fs_mountfs(struct vnode *devvp, struct mount *mp) +{ + struct lwp *l = curlwp; + struct ufsmount *ump; + struct buf *bp; + struct ext2fs *fs; + struct m_ext2fs *m_fs; + dev_t dev; + int error, i, ronly; + kauth_cred_t cred; + struct proc *p; + + dev = devvp->v_rdev; + p = l ? l->l_proc : NULL; + cred = l ? l->l_cred : NOCRED; + + /* Flush out any old buffers remaining from a previous use. 
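+ * V_SAVE asks vinvalbuf() to write any dirty buffers to disk before they
+ * are invalidated.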
*/ + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = vinvalbuf(devvp, V_SAVE, cred, l, 0, 0); + VOP_UNLOCK(devvp); + if (error) + return (error); + + ronly = (mp->mnt_flag & MNT_RDONLY) != 0; + + bp = NULL; + ump = NULL; + +#ifdef DEBUG_EXT2 + printf("ext2 sb size: %zu\n", sizeof(struct ext2fs)); +#endif + error = bread(devvp, SBLOCK, SBSIZE, cred, 0, &bp); + if (error) + goto out; + fs = (struct ext2fs *)bp->b_data; + error = ext2fs_checksb(fs, ronly); + if (error) + goto out; + ump = malloc(sizeof(*ump), M_UFSMNT, M_WAITOK); + memset(ump, 0, sizeof(*ump)); + ump->um_fstype = UFS1; + ump->um_ops = &ext2fs_ufsops; + ump->um_e2fs = malloc(sizeof(struct m_ext2fs), M_UFSMNT, M_WAITOK); + memset(ump->um_e2fs, 0, sizeof(struct m_ext2fs)); + e2fs_sbload((struct ext2fs *)bp->b_data, &ump->um_e2fs->e2fs); + brelse(bp, 0); + bp = NULL; + m_fs = ump->um_e2fs; + m_fs->e2fs_ronly = ronly; + +#ifdef DEBUG_EXT2 + printf("ext2 ino size %zu\n", EXT2_DINODE_SIZE(m_fs)); +#endif + if (ronly == 0) { + if (m_fs->e2fs.e2fs_state == E2FS_ISCLEAN) + m_fs->e2fs.e2fs_state = 0; + else + m_fs->e2fs.e2fs_state = E2FS_ERRORS; + m_fs->e2fs_fmod = 1; + } + + /* compute dynamic sb infos */ + m_fs->e2fs_ncg = + howmany(m_fs->e2fs.e2fs_bcount - m_fs->e2fs.e2fs_first_dblock, + m_fs->e2fs.e2fs_bpg); + m_fs->e2fs_fsbtodb = m_fs->e2fs.e2fs_log_bsize + LOG_MINBSIZE - DEV_BSHIFT; + m_fs->e2fs_bsize = MINBSIZE << m_fs->e2fs.e2fs_log_bsize; + m_fs->e2fs_bshift = LOG_MINBSIZE + m_fs->e2fs.e2fs_log_bsize; + m_fs->e2fs_qbmask = m_fs->e2fs_bsize - 1; + m_fs->e2fs_bmask = ~m_fs->e2fs_qbmask; + m_fs->e2fs_ngdb = + howmany(m_fs->e2fs_ncg, m_fs->e2fs_bsize / sizeof(struct ext2_gd)); + m_fs->e2fs_ipb = m_fs->e2fs_bsize / EXT2_DINODE_SIZE(m_fs); + m_fs->e2fs_itpg = m_fs->e2fs.e2fs_ipg / m_fs->e2fs_ipb; + + m_fs->e2fs_gd = malloc(m_fs->e2fs_ngdb * m_fs->e2fs_bsize, + M_UFSMNT, M_WAITOK); + for (i = 0; i < m_fs->e2fs_ngdb; i++) { + error = bread(devvp , + fsbtodb(m_fs, m_fs->e2fs.e2fs_first_dblock + + 1 /* superblock */ + i), + m_fs->e2fs_bsize, NOCRED, 0, &bp); + if (error) { + free(m_fs->e2fs_gd, M_UFSMNT); + goto out; + } + e2fs_cgload((struct ext2_gd *)bp->b_data, + &m_fs->e2fs_gd[ + i * m_fs->e2fs_bsize / sizeof(struct ext2_gd)], + m_fs->e2fs_bsize); + brelse(bp, 0); + bp = NULL; + } + + mp->mnt_data = ump; + mp->mnt_stat.f_fsidx.__fsid_val[0] = (long)dev; + mp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_EXT2FS); + mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; + mp->mnt_stat.f_namemax = EXT2FS_MAXNAMLEN; + mp->mnt_flag |= MNT_LOCAL; + mp->mnt_dev_bshift = DEV_BSHIFT; /* XXX */ + mp->mnt_fs_bshift = m_fs->e2fs_bshift; + mp->mnt_iflag |= IMNT_DTYPE; + ump->um_flags = 0; + ump->um_mountp = mp; + ump->um_dev = dev; + ump->um_devvp = devvp; + ump->um_nindir = NINDIR(m_fs); + ump->um_lognindir = ffs(NINDIR(m_fs)) - 1; + ump->um_bptrtodb = m_fs->e2fs_fsbtodb; + ump->um_seqinc = 1; /* no frags */ + ump->um_maxsymlinklen = EXT2_MAXSYMLINKLEN; + ump->um_dirblksiz = m_fs->e2fs_bsize; + ump->um_maxfilesize = ((uint64_t)0x80000000 * m_fs->e2fs_bsize - 1); + devvp->v_specmountpoint = mp; + return (0); + +out: + KASSERT(bp != NULL); + brelse(bp, 0); + if (ump) { + free(ump->um_e2fs, M_UFSMNT); + free(ump, M_UFSMNT); + mp->mnt_data = NULL; + } + return (error); +} + +/* + * unmount system call + */ +int +ext2fs_unmount(struct mount *mp, int mntflags) +{ + struct ufsmount *ump; + struct m_ext2fs *fs; + int error, flags; + + flags = 0; + if (mntflags & MNT_FORCE) + flags |= FORCECLOSE; + if ((error = ext2fs_flushfiles(mp, flags)) != 
0) + return (error); + ump = VFSTOUFS(mp); + fs = ump->um_e2fs; + if (fs->e2fs_ronly == 0 && + ext2fs_cgupdate(ump, MNT_WAIT) == 0 && + (fs->e2fs.e2fs_state & E2FS_ERRORS) == 0) { + fs->e2fs.e2fs_state = E2FS_ISCLEAN; + (void) ext2fs_sbupdate(ump, MNT_WAIT); + } + if (ump->um_devvp->v_type != VBAD) + ump->um_devvp->v_specmountpoint = NULL; + vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_CLOSE(ump->um_devvp, fs->e2fs_ronly ? FREAD : FREAD|FWRITE, + NOCRED); + vput(ump->um_devvp); + free(fs->e2fs_gd, M_UFSMNT); + free(fs, M_UFSMNT); + free(ump, M_UFSMNT); + mp->mnt_data = NULL; + mp->mnt_flag &= ~MNT_LOCAL; + return (error); +} + +/* + * Flush out all the files in a filesystem. + */ +int +ext2fs_flushfiles(struct mount *mp, int flags) +{ + extern int doforce; + int error; + + if (!doforce) + flags &= ~FORCECLOSE; + error = vflush(mp, NULLVP, flags); + return (error); +} + +/* + * Get file system statistics. + */ +int +ext2fs_statvfs(struct mount *mp, struct statvfs *sbp) +{ + struct ufsmount *ump; + struct m_ext2fs *fs; + uint32_t overhead, overhead_per_group, ngdb; + int i, ngroups; + + ump = VFSTOUFS(mp); + fs = ump->um_e2fs; + if (fs->e2fs.e2fs_magic != E2FS_MAGIC) + panic("ext2fs_statvfs"); + + /* + * Compute the overhead (FS structures) + */ + overhead_per_group = + 1 /* block bitmap */ + + 1 /* inode bitmap */ + + fs->e2fs_itpg; + overhead = fs->e2fs.e2fs_first_dblock + + fs->e2fs_ncg * overhead_per_group; + if (fs->e2fs.e2fs_rev > E2FS_REV0 && + fs->e2fs.e2fs_features_rocompat & EXT2F_ROCOMPAT_SPARSESUPER) { + for (i = 0, ngroups = 0; i < fs->e2fs_ncg; i++) { + if (cg_has_sb(i)) + ngroups++; + } + } else { + ngroups = fs->e2fs_ncg; + } + ngdb = fs->e2fs_ngdb; + if (fs->e2fs.e2fs_rev > E2FS_REV0 && + fs->e2fs.e2fs_features_compat & EXT2F_COMPAT_RESIZE) + ngdb += fs->e2fs.e2fs_reserved_ngdb; + overhead += ngroups * (1 /* superblock */ + ngdb); + + sbp->f_bsize = fs->e2fs_bsize; + sbp->f_frsize = MINBSIZE << fs->e2fs.e2fs_fsize; + sbp->f_iosize = fs->e2fs_bsize; + sbp->f_blocks = fs->e2fs.e2fs_bcount - overhead; + sbp->f_bfree = fs->e2fs.e2fs_fbcount; + sbp->f_bresvd = fs->e2fs.e2fs_rbcount; + if (sbp->f_bfree > sbp->f_bresvd) + sbp->f_bavail = sbp->f_bfree - sbp->f_bresvd; + else + sbp->f_bavail = 0; + sbp->f_files = fs->e2fs.e2fs_icount; + sbp->f_ffree = fs->e2fs.e2fs_ficount; + sbp->f_favail = fs->e2fs.e2fs_ficount; + sbp->f_fresvd = 0; + copy_statvfs_info(sbp, mp); + return (0); +} + +/* + * Go through the disk queues to initiate sandbagged IO; + * go through the inodes to write those that have been modified; + * initiate the writing of the super block if it has been modified. + * + * Note: we are always called with the filesystem marked `MPBUSY'. + */ +int +ext2fs_sync(struct mount *mp, int waitfor, kauth_cred_t cred) +{ + struct vnode *vp, *mvp; + struct inode *ip; + struct ufsmount *ump = VFSTOUFS(mp); + struct m_ext2fs *fs; + int error, allerror = 0; + + fs = ump->um_e2fs; + if (fs->e2fs_fmod != 0 && fs->e2fs_ronly != 0) { /* XXX */ + printf("fs = %s\n", fs->e2fs_fsmnt); + panic("update: rofs mod"); + } + + /* Allocate a marker vnode. */ + mvp = vnalloc(mp); + + /* + * Write back each (modified) inode. 
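+ * A marker vnode (mvp) holds our place in mnt_vnodelist across the points
+ * where this loop can sleep, since the list may change underneath us.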
+ */ + mutex_enter(&mntvnode_lock); +loop: + /* + * NOTE: not using the TAILQ_FOREACH here since in this loop vgone() + * and vclean() can be called indirectly + */ + for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) { + vmark(mvp, vp); + if (vp->v_mount != mp || vismarker(vp)) + continue; + mutex_enter(vp->v_interlock); + ip = VTOI(vp); + if (ip == NULL || (vp->v_iflag & (VI_XLOCK|VI_CLEAN)) != 0 || + vp->v_type == VNON || + ((ip->i_flag & + (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) == 0 && + LIST_EMPTY(&vp->v_dirtyblkhd) && + UVM_OBJ_IS_CLEAN(&vp->v_uobj))) + { + mutex_exit(vp->v_interlock); + continue; + } + mutex_exit(&mntvnode_lock); + error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT); + if (error) { + mutex_enter(&mntvnode_lock); + if (error == ENOENT) { + mutex_enter(&mntvnode_lock); + (void)vunmark(mvp); + goto loop; + } + continue; + } + if (vp->v_type == VREG && waitfor == MNT_LAZY) + error = ext2fs_update(vp, NULL, NULL, 0); + else + error = VOP_FSYNC(vp, cred, + waitfor == MNT_WAIT ? FSYNC_WAIT : 0, 0, 0); + if (error) + allerror = error; + vput(vp); + mutex_enter(&mntvnode_lock); + } + mutex_exit(&mntvnode_lock); + vnfree(mvp); + /* + * Force stale file system control information to be flushed. + */ + if (waitfor != MNT_LAZY) { + vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); + if ((error = VOP_FSYNC(ump->um_devvp, cred, + waitfor == MNT_WAIT ? FSYNC_WAIT : 0, 0, 0)) != 0) + allerror = error; + VOP_UNLOCK(ump->um_devvp); + } + /* + * Write back modified superblock. + */ + if (fs->e2fs_fmod != 0) { + fs->e2fs_fmod = 0; + fs->e2fs.e2fs_wtime = time_second; + if ((error = ext2fs_cgupdate(ump, waitfor))) + allerror = error; + } + return (allerror); +} + +/* + * Look up a EXT2FS dinode number to find its incore vnode, otherwise read it + * in from disk. If it is in core, wait for the lock bit to clear, then + * return the inode locked. Detection and handling of mount points must be + * done by the calling routine. + */ +int +ext2fs_vget(struct mount *mp, ino_t ino, struct vnode **vpp) +{ + struct m_ext2fs *fs; + struct inode *ip; + struct ufsmount *ump; + struct buf *bp; + struct vnode *vp; + dev_t dev; + int error; + void *cp; + + ump = VFSTOUFS(mp); + dev = ump->um_dev; +retry: + if ((*vpp = ufs_ihashget(dev, ino, LK_EXCLUSIVE)) != NULL) + return (0); + + /* Allocate a new vnode/inode. */ + error = getnewvnode(VT_EXT2FS, mp, ext2fs_vnodeop_p, NULL, &vp); + if (error) { + *vpp = NULL; + return (error); + } + ip = pool_get(&ext2fs_inode_pool, PR_WAITOK); + + mutex_enter(&ufs_hashlock); + if ((*vpp = ufs_ihashget(dev, ino, 0)) != NULL) { + mutex_exit(&ufs_hashlock); + ungetnewvnode(vp); + pool_put(&ext2fs_inode_pool, ip); + goto retry; + } + + vp->v_vflag |= VV_LOCKSWORK; + + memset(ip, 0, sizeof(struct inode)); + vp->v_data = ip; + ip->i_vnode = vp; + ip->i_ump = ump; + ip->i_e2fs = fs = ump->um_e2fs; + ip->i_dev = dev; + ip->i_number = ino; + ip->i_e2fs_last_lblk = 0; + ip->i_e2fs_last_blk = 0; + genfs_node_init(vp, &ext2fs_genfsops); + + /* + * Put it onto its hash chain and lock it so that other requests for + * this inode will block if they arrive while we are sleeping waiting + * for old data structures to be purged or for the contents of the + * disk portion of this inode to be read. + */ + + ufs_ihashins(ip); + mutex_exit(&ufs_hashlock); + + /* Read in the disk contents for the inode, copy into the inode. 
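+ * ino_to_fsba() locates the filesystem block holding the inode and
+ * ino_to_fsbo() its slot within that block; e2fs_iload() then copies the
+ * on-disk dinode into the in-core structure, converting byte order where
+ * needed.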
*/ + error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), + (int)fs->e2fs_bsize, NOCRED, 0, &bp); + if (error) { + + /* + * The inode does not contain anything useful, so it would + * be misleading to leave it on its hash chain. With mode + * still zero, it will be unlinked and returned to the free + * list by vput(). + */ + + vput(vp); + brelse(bp, 0); + *vpp = NULL; + return (error); + } + cp = (char *)bp->b_data + (ino_to_fsbo(fs, ino) * EXT2_DINODE_SIZE(fs)); + ip->i_din.e2fs_din = pool_get(&ext2fs_dinode_pool, PR_WAITOK); + e2fs_iload((struct ext2fs_dinode *)cp, ip->i_din.e2fs_din); + ext2fs_set_inode_guid(ip); + brelse(bp, 0); + + /* If the inode was deleted, reset all fields */ + if (ip->i_e2fs_dtime != 0) { + ip->i_e2fs_mode = ip->i_e2fs_nblock = 0; + (void)ext2fs_setsize(ip, 0); + memset(ip->i_e2fs_blocks, 0, sizeof(ip->i_e2fs_blocks)); + } + + /* + * Initialize the vnode from the inode, check for aliases. + */ + + error = ext2fs_vinit(mp, ext2fs_specop_p, ext2fs_fifoop_p, &vp); + if (error) { + vput(vp); + *vpp = NULL; + return (error); + } + /* + * Finish inode initialization now that aliasing has been resolved. + */ + + ip->i_devvp = ump->um_devvp; + vref(ip->i_devvp); + + /* + * Set up a generation number for this inode if it does not + * already have one. This should only happen on old filesystems. + */ + + if (ip->i_e2fs_gen == 0) { + if (++ext2gennumber < (u_long)time_second) + ext2gennumber = time_second; + ip->i_e2fs_gen = ext2gennumber; + if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) + ip->i_flag |= IN_MODIFIED; + } + uvm_vnp_setsize(vp, ext2fs_size(ip)); + *vpp = vp; + return (0); +} + +/* + * File handle to vnode + * + * Have to be really careful about stale file handles: + * - check that the inode number is valid + * - call ext2fs_vget() to get the locked inode + * - check for an unallocated inode (i_mode == 0) + */ +int +ext2fs_fhtovp(struct mount *mp, struct fid *fhp, struct vnode **vpp) +{ + struct inode *ip; + struct vnode *nvp; + int error; + struct ufid ufh; + struct m_ext2fs *fs; + + if (fhp->fid_len != sizeof(struct ufid)) + return EINVAL; + + memcpy(&ufh, fhp, sizeof(struct ufid)); + fs = VFSTOUFS(mp)->um_e2fs; + if ((ufh.ufid_ino < EXT2_FIRSTINO && ufh.ufid_ino != EXT2_ROOTINO) || + ufh.ufid_ino >= fs->e2fs_ncg * fs->e2fs.e2fs_ipg) + return (ESTALE); + + if ((error = VFS_VGET(mp, ufh.ufid_ino, &nvp)) != 0) { + *vpp = NULLVP; + return (error); + } + ip = VTOI(nvp); + if (ip->i_e2fs_mode == 0 || ip->i_e2fs_dtime != 0 || + ip->i_e2fs_gen != ufh.ufid_gen) { + vput(nvp); + *vpp = NULLVP; + return (ESTALE); + } + *vpp = nvp; + return (0); +} + +/* + * Vnode pointer to File handle + */ +/* ARGSUSED */ +int +ext2fs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size) +{ + struct inode *ip; + struct ufid ufh; + + if (*fh_size < sizeof(struct ufid)) { + *fh_size = sizeof(struct ufid); + return E2BIG; + } + *fh_size = sizeof(struct ufid); + + ip = VTOI(vp); + memset(&ufh, 0, sizeof(ufh)); + ufh.ufid_len = sizeof(struct ufid); + ufh.ufid_ino = ip->i_number; + ufh.ufid_gen = ip->i_e2fs_gen; + memcpy(fhp, &ufh, sizeof(ufh)); + return (0); +} + +/* + * Write a superblock and associated information back to disk. 
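+ * ext2fs_sbupdate() rewrites the superblock itself; ext2fs_cgupdate()
+ * below additionally rewrites the group descriptor blocks.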
+ */ +int +ext2fs_sbupdate(struct ufsmount *mp, int waitfor) +{ + struct m_ext2fs *fs = mp->um_e2fs; + struct buf *bp; + int error = 0; + + bp = getblk(mp->um_devvp, SBLOCK, SBSIZE, 0, 0); + e2fs_sbsave(&fs->e2fs, (struct ext2fs*)bp->b_data); + if (waitfor == MNT_WAIT) + error = bwrite(bp); + else + bawrite(bp); + return (error); +} + +int +ext2fs_cgupdate(struct ufsmount *mp, int waitfor) +{ + struct m_ext2fs *fs = mp->um_e2fs; + struct buf *bp; + int i, error = 0, allerror = 0; + + allerror = ext2fs_sbupdate(mp, waitfor); + for (i = 0; i < fs->e2fs_ngdb; i++) { + bp = getblk(mp->um_devvp, fsbtodb(fs, + fs->e2fs.e2fs_first_dblock + + 1 /* superblock */ + i), fs->e2fs_bsize, 0, 0); + e2fs_cgsave(&fs->e2fs_gd[ + i * fs->e2fs_bsize / sizeof(struct ext2_gd)], + (struct ext2_gd *)bp->b_data, fs->e2fs_bsize); + if (waitfor == MNT_WAIT) + error = bwrite(bp); + else + bawrite(bp); + } + + if (!allerror && error) + allerror = error; + return (allerror); +} + +static int +ext2fs_checksb(struct ext2fs *fs, int ronly) +{ + + if (fs2h16(fs->e2fs_magic) != E2FS_MAGIC) { + return (EINVAL); /* XXX needs translation */ + } + if (fs2h32(fs->e2fs_rev) > E2FS_REV1) { +#ifdef DIAGNOSTIC + printf("Ext2 fs: unsupported revision number: %x\n", + fs2h32(fs->e2fs_rev)); +#endif + return (EINVAL); /* XXX needs translation */ + } + if (fs2h32(fs->e2fs_log_bsize) > 2) { /* block size = 1024|2048|4096 */ +#ifdef DIAGNOSTIC + printf("Ext2 fs: bad block size: %d " + "(expected <= 2 for ext2 fs)\n", + fs2h32(fs->e2fs_log_bsize)); +#endif + return (EINVAL); /* XXX needs translation */ + } + if (fs2h32(fs->e2fs_rev) > E2FS_REV0) { + if (fs2h32(fs->e2fs_first_ino) != EXT2_FIRSTINO) { + printf("Ext2 fs: unsupported first inode position\n"); + return (EINVAL); /* XXX needs translation */ + } + if (fs2h32(fs->e2fs_features_incompat) & + ~EXT2F_INCOMPAT_SUPP) { + printf("Ext2 fs: unsupported optional feature\n"); + return (EINVAL); /* XXX needs translation */ + } + if (!ronly && fs2h32(fs->e2fs_features_rocompat) & + ~EXT2F_ROCOMPAT_SUPP) { + return (EROFS); /* XXX needs translation */ + } + } + return (0); +} diff --git a/sys/ufs/ext2fs/ext2fs_vnops.c b/sys/ufs/ext2fs/ext2fs_vnops.c new file mode 100644 index 000000000..0ea5dd35c --- /dev/null +++ b/sys/ufs/ext2fs/ext2fs_vnops.c @@ -0,0 +1,1664 @@ +/* $NetBSD: ext2fs_vnops.c,v 1.101 2011/11/18 21:18:51 christos Exp $ */ + +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_vnops.c 8.14 (Berkeley) 10/26/94 + * Modified for ext2fs by Manuel Bouyer. + */ + +/* + * Copyright (c) 1997 Manuel Bouyer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @(#)ufs_vnops.c 8.14 (Berkeley) 10/26/94 + * Modified for ext2fs by Manuel Bouyer. 
+ */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ext2fs_vnops.c,v 1.101 2011/11/18 21:18:51 christos Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +extern int prtactive; + +static int ext2fs_chmod(struct vnode *, int, kauth_cred_t, struct lwp *); +static int ext2fs_chown(struct vnode *, uid_t, gid_t, kauth_cred_t, + struct lwp *); + +union _qcvt { + int64_t qcvt; + int32_t val[2]; +}; + +#define SETHIGH(q, h) { \ + union _qcvt tmp; \ + tmp.qcvt = (q); \ + tmp.val[_QUAD_HIGHWORD] = (h); \ + (q) = tmp.qcvt; \ +} +#define SETLOW(q, l) { \ + union _qcvt tmp; \ + tmp.qcvt = (q); \ + tmp.val[_QUAD_LOWWORD] = (l); \ + (q) = tmp.qcvt; \ +} + +/* + * Create a regular file + */ +int +ext2fs_create(void *v) +{ + struct vop_create_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap = v; + int error; + + error = + ext2fs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), + ap->a_dvp, ap->a_vpp, ap->a_cnp); + + if (error) + return (error); + VN_KNOTE(ap->a_dvp, NOTE_WRITE); + return (0); +} + +/* + * Mknod vnode call + */ +/* ARGSUSED */ +int +ext2fs_mknod(void *v) +{ + struct vop_mknod_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap = v; + struct vattr *vap = ap->a_vap; + struct vnode **vpp = ap->a_vpp; + struct inode *ip; + int error; + struct mount *mp; + ino_t ino; + + if ((error = ext2fs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), + ap->a_dvp, vpp, ap->a_cnp)) != 0) + return (error); + VN_KNOTE(ap->a_dvp, NOTE_WRITE); + ip = VTOI(*vpp); + mp = (*vpp)->v_mount; + ino = ip->i_number; + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + if (vap->va_rdev != VNOVAL) { + /* + * Want to be able to use this to make badblock + * inodes, so don't truncate the dev number. + */ + ip->i_din.e2fs_din->e2di_rdev = h2fs32(vap->va_rdev); + } + /* + * Remove inode so that it will be reloaded by VFS_VGET and + * checked to see if it is an alias of an existing entry in + * the inode cache. + */ + VOP_UNLOCK(*vpp); + (*vpp)->v_type = VNON; + vgone(*vpp); + error = VFS_VGET(mp, ino, vpp); + if (error != 0) { + *vpp = NULL; + return (error); + } + return (0); +} + +/* + * Open called. + * + * Just check the APPEND flag. + */ +/* ARGSUSED */ +int +ext2fs_open(void *v) +{ + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + kauth_cred_t a_cred; + } */ *ap = v; + + /* + * Files marked append-only must be opened for appending. + */ + if ((VTOI(ap->a_vp)->i_e2fs_flags & EXT2_APPEND) && + (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) + return (EPERM); + return (0); +} + +static int +ext2fs_check_possible(struct vnode *vp, struct inode *ip, mode_t mode) +{ + + /* + * Disallow write attempts on read-only file systems; + * unless the file is a socket, fifo, or a block or + * character device resident on the file system. + */ + if (mode & VWRITE) { + switch (vp->v_type) { + case VDIR: + case VLNK: + case VREG: + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + break; + default: + break; + } + } + + /* If immutable bit set, nobody gets to write it. 
*/ + if ((mode & VWRITE) && (ip->i_e2fs_flags & EXT2_IMMUTABLE)) + return (EPERM); + + return 0; +} + +static int +ext2fs_check_permitted(struct vnode *vp, struct inode *ip, mode_t mode, + kauth_cred_t cred) +{ + + return genfs_can_access(vp->v_type, ip->i_e2fs_mode & ALLPERMS, + ip->i_uid, ip->i_gid, mode, cred); +} + +int +ext2fs_access(void *v) +{ + struct vop_access_args /* { + struct vnode *a_vp; + int a_mode; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + mode_t mode = ap->a_mode; + int error; + + error = ext2fs_check_possible(vp, ip, mode); + if (error) + return error; + + error = ext2fs_check_permitted(vp, ip, mode, ap->a_cred); + + return error; +} + +/* ARGSUSED */ +int +ext2fs_getattr(void *v) +{ + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct vattr *vap = ap->a_vap; + + EXT2FS_ITIMES(ip, NULL, NULL, NULL); + /* + * Copy from inode table + */ + vap->va_fsid = ip->i_dev; + vap->va_fileid = ip->i_number; + vap->va_mode = ip->i_e2fs_mode & ALLPERMS; + vap->va_nlink = ip->i_e2fs_nlink; + vap->va_uid = ip->i_uid; + vap->va_gid = ip->i_gid; + vap->va_rdev = (dev_t)fs2h32(ip->i_din.e2fs_din->e2di_rdev); + vap->va_size = vp->v_size; + vap->va_atime.tv_sec = ip->i_e2fs_atime; + vap->va_atime.tv_nsec = 0; + vap->va_mtime.tv_sec = ip->i_e2fs_mtime; + vap->va_mtime.tv_nsec = 0; + vap->va_ctime.tv_sec = ip->i_e2fs_ctime; + vap->va_ctime.tv_nsec = 0; +#ifdef EXT2FS_SYSTEM_FLAGS + vap->va_flags = (ip->i_e2fs_flags & EXT2_APPEND) ? SF_APPEND : 0; + vap->va_flags |= (ip->i_e2fs_flags & EXT2_IMMUTABLE) ? SF_IMMUTABLE : 0; +#else + vap->va_flags = (ip->i_e2fs_flags & EXT2_APPEND) ? UF_APPEND : 0; + vap->va_flags |= (ip->i_e2fs_flags & EXT2_IMMUTABLE) ? UF_IMMUTABLE : 0; +#endif + vap->va_gen = ip->i_e2fs_gen; + /* this doesn't belong here */ + if (vp->v_type == VBLK) + vap->va_blocksize = BLKDEV_IOSIZE; + else if (vp->v_type == VCHR) + vap->va_blocksize = MAXBSIZE; + else + vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; + vap->va_bytes = dbtob((u_quad_t)ip->i_e2fs_nblock); + vap->va_type = vp->v_type; + vap->va_filerev = ip->i_modrev; + return (0); +} + +/* + * Set attribute vnode op. called from several syscalls + */ +int +ext2fs_setattr(void *v) +{ + struct vop_setattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + kauth_cred_t a_cred; + } */ *ap = v; + struct vattr *vap = ap->a_vap; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + kauth_cred_t cred = ap->a_cred; + struct lwp *l = curlwp; + int error; + + /* + * Check for unsettable attributes. 
+ */ + if ((vap->va_type != VNON) || (vap->va_nlink != (nlink_t)VNOVAL) || + (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || + (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || + ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { + return (EINVAL); + } + if (vap->va_flags != VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + if (kauth_cred_geteuid(cred) != ip->i_uid && + (error = kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, + NULL))) + return (error); +#ifdef EXT2FS_SYSTEM_FLAGS + if (kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, + NULL) == 0) { + if ((ip->i_e2fs_flags & + (EXT2_APPEND | EXT2_IMMUTABLE)) && + kauth_authorize_system(l->l_cred, + KAUTH_SYSTEM_CHSYSFLAGS, 0, NULL, NULL, NULL)) + return (EPERM); + ip->i_e2fs_flags &= ~(EXT2_APPEND | EXT2_IMMUTABLE); + ip->i_e2fs_flags |= + (vap->va_flags & SF_APPEND) ? EXT2_APPEND : 0 | + (vap->va_flags & SF_IMMUTABLE) ? EXT2_IMMUTABLE : 0; + } else + return (EPERM); +#else + ip->i_e2fs_flags &= ~(EXT2_APPEND | EXT2_IMMUTABLE); + ip->i_e2fs_flags |= + (vap->va_flags & UF_APPEND) ? EXT2_APPEND : 0 | + (vap->va_flags & UF_IMMUTABLE) ? EXT2_IMMUTABLE : 0; +#endif + ip->i_flag |= IN_CHANGE; + if (vap->va_flags & (IMMUTABLE | APPEND)) + return (0); + } + if (ip->i_e2fs_flags & (EXT2_APPEND | EXT2_IMMUTABLE)) + return (EPERM); + /* + * Go through the fields and update iff not VNOVAL. + */ + if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + error = ext2fs_chown(vp, vap->va_uid, vap->va_gid, cred, l); + if (error) + return (error); + } + if (vap->va_size != VNOVAL) { + /* + * Disallow write attempts on read-only file systems; + * unless the file is a socket, fifo, or a block or + * character device resident on the file system. + */ + switch (vp->v_type) { + case VDIR: + return (EISDIR); + case VLNK: + case VREG: + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + default: + break; + } + error = ext2fs_truncate(vp, vap->va_size, 0, cred); + if (error) + return (error); + } + ip = VTOI(vp); + if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + error = genfs_can_chtimes(vp, vap->va_vaflags, ip->i_uid, cred); + if (error) + return (error); + if (vap->va_atime.tv_sec != VNOVAL) + if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) + ip->i_flag |= IN_ACCESS; + if (vap->va_mtime.tv_sec != VNOVAL) { + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (vp->v_mount->mnt_flag & MNT_RELATIME) + ip->i_flag |= IN_ACCESS; + } + error = ext2fs_update(vp, &vap->va_atime, &vap->va_mtime, + UPDATE_WAIT); + if (error) + return (error); + } + error = 0; + if (vap->va_mode != (mode_t)VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + error = ext2fs_chmod(vp, (int)vap->va_mode, cred, l); + } + VN_KNOTE(vp, NOTE_ATTRIB); + return (error); +} + +/* + * Change the mode on a file. + * Inode must be locked before calling. + */ +static int +ext2fs_chmod(struct vnode *vp, int mode, kauth_cred_t cred, struct lwp *l) +{ + struct inode *ip = VTOI(vp); + int error; + + error = genfs_can_chmod(vp, cred, ip->i_uid, ip->i_gid, mode); + if (error) + return (error); + + ip->i_e2fs_mode &= ~ALLPERMS; + ip->i_e2fs_mode |= (mode & ALLPERMS); + ip->i_flag |= IN_CHANGE; + return (0); +} + +/* + * Perform chown operation on inode ip; + * inode must be locked prior to call. 
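+ * As in ext2fs_set_inode_guid(), the uid/gid are stored as 16-bit low
+ * halves plus, on revision 1 filesystems, 16-bit high halves.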
+ */ +static int +ext2fs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred, + struct lwp *l) +{ + struct inode *ip = VTOI(vp); + uid_t ouid; + gid_t ogid; + int error; + + if (uid == (uid_t)VNOVAL) + uid = ip->i_uid; + if (gid == (gid_t)VNOVAL) + gid = ip->i_gid; + + error = genfs_can_chown(vp, cred, ip->i_uid, ip->i_gid, uid, gid); + if (error) + return (error); + + ogid = ip->i_gid; + ouid = ip->i_uid; + + ip->i_e2fs_gid = gid & 0xffff; + ip->i_e2fs_uid = uid & 0xffff; + if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0) { + ip->i_e2fs_gid_high = (gid >> 16) & 0xffff; + ip->i_e2fs_uid_high = (uid >> 16) & 0xffff; + } else { + ip->i_e2fs_gid_high = 0; + ip->i_e2fs_uid_high = 0; + } + if (ouid != uid || ogid != gid) { + ext2fs_set_inode_guid(ip); + ip->i_flag |= IN_CHANGE; + } + if (ouid != uid && kauth_authorize_generic(cred, + KAUTH_GENERIC_ISSUSER, NULL) != 0) + ip->i_e2fs_mode &= ~ISUID; + if (ogid != gid && kauth_authorize_generic(cred, + KAUTH_GENERIC_ISSUSER, NULL) != 0) + ip->i_e2fs_mode &= ~ISGID; + return (0); +} + +int +ext2fs_remove(void *v) +{ + struct vop_remove_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap = v; + struct inode *ip; + struct vnode *vp = ap->a_vp; + struct vnode *dvp = ap->a_dvp; + struct ufs_lookup_results *ulr; + int error; + + /* XXX should handle this material another way */ + ulr = &VTOI(dvp)->i_crap; + UFS_CHECK_CRAPCOUNTER(VTOI(dvp)); + + ip = VTOI(vp); + if (vp->v_type == VDIR || + (ip->i_e2fs_flags & (EXT2_IMMUTABLE | EXT2_APPEND)) || + (VTOI(dvp)->i_e2fs_flags & EXT2_APPEND)) { + error = EPERM; + } else { + error = ext2fs_dirremove(dvp, ulr, ap->a_cnp); + if (error == 0) { + ip->i_e2fs_nlink--; + ip->i_flag |= IN_CHANGE; + } + } + + VN_KNOTE(vp, NOTE_DELETE); + VN_KNOTE(dvp, NOTE_WRITE); + if (dvp == vp) + vrele(vp); + else + vput(vp); + vput(dvp); + return (error); +} + +/* + * ext2fs_link: create hard link. + */ +int +ext2fs_link(void *v) +{ + struct vop_link_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap = v; + struct vnode *dvp = ap->a_dvp; + struct vnode *vp = ap->a_vp; + struct componentname *cnp = ap->a_cnp; + struct inode *ip; + int error; + struct ufs_lookup_results *ulr; + + KASSERT(dvp != vp); + KASSERT(vp->v_type != VDIR); + KASSERT(dvp->v_mount == vp->v_mount); + + /* XXX should handle this material another way */ + ulr = &VTOI(dvp)->i_crap; + UFS_CHECK_CRAPCOUNTER(VTOI(dvp)); + + error = vn_lock(vp, LK_EXCLUSIVE); + if (error) { + VOP_ABORTOP(dvp, cnp); + goto out2; + } + ip = VTOI(vp); + if ((nlink_t)ip->i_e2fs_nlink >= LINK_MAX) { + VOP_ABORTOP(dvp, cnp); + error = EMLINK; + goto out1; + } + if (ip->i_e2fs_flags & (EXT2_IMMUTABLE | EXT2_APPEND)) { + VOP_ABORTOP(dvp, cnp); + error = EPERM; + goto out1; + } + ip->i_e2fs_nlink++; + ip->i_flag |= IN_CHANGE; + error = ext2fs_update(vp, NULL, NULL, UPDATE_WAIT); + if (!error) + error = ext2fs_direnter(ip, dvp, ulr, cnp); + if (error) { + ip->i_e2fs_nlink--; + ip->i_flag |= IN_CHANGE; + } +out1: + VOP_UNLOCK(vp); +out2: + VN_KNOTE(vp, NOTE_LINK); + VN_KNOTE(dvp, NOTE_WRITE); + vput(dvp); + return (error); +} + +/* + * Rename system call. + * rename("foo", "bar"); + * is essentially + * unlink("bar"); + * link("foo", "bar"); + * unlink("foo"); + * but ``atomically''. Can't do full commit without saving state in the + * inode on disk which isn't feasible at this time. Best we can do is + * always guarantee the target exists. 
+ * + * Basic algorithm is: + * + * 1) Bump link count on source while we're linking it to the + * target. This also ensure the inode won't be deleted out + * from underneath us while we work (it may be truncated by + * a concurrent `trunc' or `open' for creation). + * 2) Link source to destination. If destination already exists, + * delete it first. + * 3) Unlink source reference to inode if still around. If a + * directory was moved and the parent of the destination + * is different from the source, patch the ".." entry in the + * directory. + */ +int +ext2fs_rename(void *v) +{ + struct vop_rename_args /* { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; + } */ *ap = v; + struct vnode *tvp = ap->a_tvp; + struct vnode *tdvp = ap->a_tdvp; + struct vnode *fvp = ap->a_fvp; + struct vnode *fdvp = ap->a_fdvp; + struct componentname *tcnp = ap->a_tcnp; + struct componentname *fcnp = ap->a_fcnp; + struct inode *ip, *xp, *dp; + struct ext2fs_dirtemplate dirbuf; + int doingdirectory = 0, oldparent = 0, newparent = 0; + int error = 0; + u_char namlen; + + /* + * Check for cross-device rename. + */ + if ((fvp->v_mount != tdvp->v_mount) || + (tvp && (fvp->v_mount != tvp->v_mount))) { + error = EXDEV; +abortit: + VOP_ABORTOP(tdvp, tcnp); /* XXX, why not in NFS? */ + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp) + vput(tvp); + VOP_ABORTOP(fdvp, fcnp); /* XXX, why not in NFS? */ + vrele(fdvp); + vrele(fvp); + return (error); + } + + /* + * Check if just deleting a link name. + */ + if (tvp && ((VTOI(tvp)->i_e2fs_flags & (EXT2_IMMUTABLE | EXT2_APPEND)) || + (VTOI(tdvp)->i_e2fs_flags & EXT2_APPEND))) { + error = EPERM; + goto abortit; + } + if (fvp == tvp) { + if (fvp->v_type == VDIR) { + error = EINVAL; + goto abortit; + } + + /* Release destination completely. */ + VOP_ABORTOP(tdvp, tcnp); + vput(tdvp); + vput(tvp); + + /* Delete source. */ + vrele(fvp); + fcnp->cn_flags &= ~(MODMASK); + fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; + fcnp->cn_nameiop = DELETE; + vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY); + if ((error = relookup(fdvp, &fvp, fcnp, 0))) { + vput(fdvp); + return (error); + } + return (VOP_REMOVE(fdvp, fvp, fcnp)); + } + if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0) + goto abortit; + dp = VTOI(fdvp); + ip = VTOI(fvp); + if ((nlink_t) ip->i_e2fs_nlink >= LINK_MAX) { + VOP_UNLOCK(fvp); + error = EMLINK; + goto abortit; + } + if ((ip->i_e2fs_flags & (EXT2_IMMUTABLE | EXT2_APPEND)) || + (dp->i_e2fs_flags & EXT2_APPEND)) { + VOP_UNLOCK(fvp); + error = EPERM; + goto abortit; + } + if ((ip->i_e2fs_mode & IFMT) == IFDIR) { + error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred); + if (!error && tvp) + error = VOP_ACCESS(tvp, VWRITE, tcnp->cn_cred); + if (error) { + VOP_UNLOCK(fvp); + error = EACCES; + goto abortit; + } + /* + * Avoid ".", "..", and aliases of "." for obvious reasons. + */ + if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || + dp == ip || + (fcnp->cn_flags & ISDOTDOT) || + (tcnp->cn_flags & ISDOTDOT) || + (ip->i_flag & IN_RENAME)) { + VOP_UNLOCK(fvp); + error = EINVAL; + goto abortit; + } + ip->i_flag |= IN_RENAME; + oldparent = dp->i_number; + doingdirectory = 1; + } + VN_KNOTE(fdvp, NOTE_WRITE); /* XXXLUKEM/XXX: right place? */ + + /* + * When the target exists, both the directory + * and target vnodes are returned locked. + */ + dp = VTOI(tdvp); + xp = NULL; + if (tvp) + xp = VTOI(tvp); + + /* + * 1) Bump link count while we're moving stuff + * around. 
If we crash somewhere before + * completing our work, the link count + * may be wrong, but correctable. + */ + ip->i_e2fs_nlink++; + ip->i_flag |= IN_CHANGE; + if ((error = ext2fs_update(fvp, NULL, NULL, UPDATE_WAIT)) != 0) { + VOP_UNLOCK(fvp); + goto bad; + } + + /* + * If ".." must be changed (ie the directory gets a new + * parent) then the source directory must not be in the + * directory hierarchy above the target, as this would + * orphan everything below the source directory. Also + * the user must have write permission in the source so + * as to be able to change "..". We must repeat the call + * to namei, as the parent directory is unlocked by the + * call to checkpath(). + */ + error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred); + VOP_UNLOCK(fvp); + if (oldparent != dp->i_number) + newparent = dp->i_number; + if (doingdirectory && newparent) { + if (error) /* write access check above */ + goto bad; + if (xp != NULL) + vput(tvp); + vref(tdvp); /* compensate for the ref checkpath loses */ + error = ext2fs_checkpath(ip, dp, tcnp->cn_cred); + if (error != 0) { + vrele(tdvp); + goto out; + } + vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY); + if ((error = relookup(tdvp, &tvp, tcnp, 0)) != 0) { + vput(tdvp); + goto out; + } + dp = VTOI(tdvp); + xp = NULL; + if (tvp) + xp = VTOI(tvp); + } + /* + * 2) If target doesn't exist, link the target + * to the source and unlink the source. + * Otherwise, rewrite the target directory + * entry to reference the source inode and + * expunge the original entry's existence. + */ + if (xp == NULL) { + if (dp->i_dev != ip->i_dev) + panic("rename: EXDEV"); + /* + * Account for ".." in new directory. + * When source and destination have the same + * parent we don't fool with the link count. + */ + if (doingdirectory && newparent) { + if ((nlink_t)dp->i_e2fs_nlink >= LINK_MAX) { + error = EMLINK; + goto bad; + } + dp->i_e2fs_nlink++; + dp->i_flag |= IN_CHANGE; + if ((error = ext2fs_update(tdvp, NULL, NULL, + UPDATE_WAIT)) != 0) + goto bad; + } + error = ext2fs_direnter(ip, tdvp, &VTOI(tdvp)->i_crap, tcnp); + if (error != 0) { + if (doingdirectory && newparent) { + dp->i_e2fs_nlink--; + dp->i_flag |= IN_CHANGE; + (void)ext2fs_update(tdvp, NULL, NULL, + UPDATE_WAIT); + } + goto bad; + } + VN_KNOTE(tdvp, NOTE_WRITE); + vput(tdvp); + } else { + if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev) + panic("rename: EXDEV"); + /* + * Short circuit rename(foo, foo). + */ + if (xp->i_number == ip->i_number) + panic("rename: same file"); + /* + * If the parent directory is "sticky", then the user must + * own the parent directory, or the destination of the rename, + * otherwise the destination may not be changed (except by + * root). This implements append-only directories. + */ + if ((dp->i_e2fs_mode & S_ISTXT) && + kauth_authorize_generic(tcnp->cn_cred, + KAUTH_GENERIC_ISSUSER, NULL) != 0 && + kauth_cred_geteuid(tcnp->cn_cred) != dp->i_uid && + xp->i_uid != kauth_cred_geteuid(tcnp->cn_cred)) { + error = EPERM; + goto bad; + } + /* + * Target must be empty if a directory and have no links + * to it. Also, ensure source and target are compatible + * (both directories, or both not directories). 
+ */ + if ((xp->i_e2fs_mode & IFMT) == IFDIR) { + if (!ext2fs_dirempty(xp, dp->i_number, tcnp->cn_cred) || + xp->i_e2fs_nlink > 2) { + error = ENOTEMPTY; + goto bad; + } + if (!doingdirectory) { + error = ENOTDIR; + goto bad; + } + cache_purge(tdvp); + } else if (doingdirectory) { + error = EISDIR; + goto bad; + } + error = ext2fs_dirrewrite(dp, &dp->i_crap, ip, tcnp); + if (error != 0) + goto bad; + /* + * If the target directory is in the same + * directory as the source directory, + * decrement the link count on the parent + * of the target directory. + */ + if (doingdirectory && !newparent) { + dp->i_e2fs_nlink--; + dp->i_flag |= IN_CHANGE; + } + /* + * Adjust the link count of the target to + * reflect the dirrewrite above. If this is + * a directory it is empty and there are + * no links to it, so we can squash the inode and + * any space associated with it. We disallowed + * renaming over top of a directory with links to + * it above, as the remaining link would point to + * a directory without "." or ".." entries. + */ + xp->i_e2fs_nlink--; + if (doingdirectory) { + if (--xp->i_e2fs_nlink != 0) + panic("rename: linked directory"); + error = ext2fs_truncate(tvp, (off_t)0, IO_SYNC, + tcnp->cn_cred); + } + xp->i_flag |= IN_CHANGE; + VN_KNOTE(tdvp, NOTE_WRITE); + vput(tdvp); + VN_KNOTE(tvp, NOTE_DELETE); + vput(tvp); + xp = NULL; + } + + /* + * 3) Unlink the source. + */ + fcnp->cn_flags &= ~(MODMASK); + fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; + vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY); + if ((error = relookup(fdvp, &fvp, fcnp, 0))) { + vput(fdvp); + vrele(ap->a_fvp); + return (error); + } + if (fvp != NULL) { + xp = VTOI(fvp); + dp = VTOI(fdvp); + } else { + /* + * From name has disappeared. + */ + if (doingdirectory) + panic("ext2fs_rename: lost dir entry"); + vrele(ap->a_fvp); + return (0); + } + /* + * Ensure that the directory entry still exists and has not + * changed while the new name has been entered. If the source is + * a file then the entry may have been unlinked or renamed. In + * either case there is no further work to be done. If the source + * is a directory then it cannot have been rmdir'ed; its link + * count of three would cause a rmdir to fail with ENOTEMPTY. + * The IRENAME flag ensures that it cannot be moved by another + * rename. + */ + if (xp != ip) { + if (doingdirectory) + panic("ext2fs_rename: lost dir entry"); + } else { + /* + * If the source is a directory with a + * new parent, the link count of the old + * parent directory must be decremented + * and ".." set to point to the new parent. + */ + if (doingdirectory && newparent) { + KASSERT(dp != NULL); + dp->i_e2fs_nlink--; + dp->i_flag |= IN_CHANGE; + error = vn_rdwr(UIO_READ, fvp, (void *)&dirbuf, + sizeof (struct ext2fs_dirtemplate), (off_t)0, + UIO_SYSSPACE, IO_NODELOCKED, + tcnp->cn_cred, (size_t *)0, NULL); + if (error == 0) { + namlen = dirbuf.dotdot_namlen; + if (namlen != 2 || + dirbuf.dotdot_name[0] != '.' 
|| + dirbuf.dotdot_name[1] != '.') { + ufs_dirbad(xp, (doff_t)12, + "ext2fs_rename: mangled dir"); + } else { + dirbuf.dotdot_ino = h2fs32(newparent); + (void) vn_rdwr(UIO_WRITE, fvp, + (void *)&dirbuf, + sizeof (struct dirtemplate), + (off_t)0, UIO_SYSSPACE, + IO_NODELOCKED|IO_SYNC, + tcnp->cn_cred, (size_t *)0, + NULL); + cache_purge(fdvp); + } + } + } + error = ext2fs_dirremove(fdvp, &VTOI(fdvp)->i_crap, fcnp); + if (!error) { + xp->i_e2fs_nlink--; + xp->i_flag |= IN_CHANGE; + } + xp->i_flag &= ~IN_RENAME; + } + VN_KNOTE(fvp, NOTE_RENAME); + if (dp) + vput(fdvp); + if (xp) + vput(fvp); + vrele(ap->a_fvp); + return (error); + +bad: + if (xp) + vput(ITOV(xp)); + vput(ITOV(dp)); +out: + if (doingdirectory) + ip->i_flag &= ~IN_RENAME; + if (vn_lock(fvp, LK_EXCLUSIVE) == 0) { + ip->i_e2fs_nlink--; + ip->i_flag |= IN_CHANGE; + vput(fvp); + } else + vrele(fvp); + vrele(fdvp); + return (error); +} + +/* + * Mkdir system call + */ +int +ext2fs_mkdir(void *v) +{ + struct vop_mkdir_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap = v; + struct vnode *dvp = ap->a_dvp; + struct vattr *vap = ap->a_vap; + struct componentname *cnp = ap->a_cnp; + struct inode *ip, *dp = VTOI(dvp); + struct vnode *tvp; + struct ext2fs_dirtemplate dirtemplate; + int error, dmode; + struct ufs_lookup_results *ulr; + + /* XXX should handle this material another way */ + ulr = &VTOI(dvp)->i_crap; + UFS_CHECK_CRAPCOUNTER(VTOI(dvp)); + + if ((nlink_t)dp->i_e2fs_nlink >= LINK_MAX) { + error = EMLINK; + goto out; + } + dmode = vap->va_mode & ACCESSPERMS; + dmode |= IFDIR; + /* + * Must simulate part of ext2fs_makeinode here to acquire the inode, + * but not have it entered in the parent directory. The entry is + * made later after writing "." and ".." entries. + */ + if ((error = ext2fs_valloc(dvp, dmode, cnp->cn_cred, &tvp)) != 0) + goto out; + ip = VTOI(tvp); + ip->i_uid = kauth_cred_geteuid(cnp->cn_cred); + ip->i_e2fs_uid = ip->i_uid & 0xffff; + ip->i_e2fs_gid = dp->i_e2fs_gid; + if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0) { + ip->i_e2fs_uid_high = (ip->i_uid >> 16) & 0xffff; + ip->i_e2fs_gid_high = dp->i_e2fs_gid_high; + } else { + ip->i_e2fs_uid_high = 0; + ip->i_e2fs_gid_high = 0; + } + ip->i_gid = ip->i_e2fs_gid | (ip->i_e2fs_gid_high << 16); + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + ip->i_e2fs_mode = dmode; + tvp->v_type = VDIR; /* Rest init'd in getnewvnode(). */ + ip->i_e2fs_nlink = 2; + + /* + * Bump link count in parent directory + * to reflect work done below. Should + * be done before reference is created + * so reparation is possible if we crash. + */ + dp->i_e2fs_nlink++; + dp->i_flag |= IN_CHANGE; + if ((error = ext2fs_update(dvp, NULL, NULL, UPDATE_DIROP)) != 0) + goto bad; + + /* Initialize directory with "." and ".." from static template. 
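+ * The "." entry is given a fixed 12-byte record length and ".." takes the
+ * rest of the first block, as encoded in the reclen fields.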
*/ + memset(&dirtemplate, 0, sizeof(dirtemplate)); + dirtemplate.dot_ino = h2fs32(ip->i_number); + dirtemplate.dot_reclen = h2fs16(12); + dirtemplate.dot_namlen = 1; + if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0 && + (ip->i_e2fs->e2fs.e2fs_features_incompat & EXT2F_INCOMPAT_FTYPE)) { + dirtemplate.dot_type = EXT2_FT_DIR; + } + dirtemplate.dot_name[0] = '.'; + dirtemplate.dotdot_ino = h2fs32(dp->i_number); + dirtemplate.dotdot_reclen = h2fs16(VTOI(dvp)->i_e2fs->e2fs_bsize - 12); + dirtemplate.dotdot_namlen = 2; + if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0 && + (ip->i_e2fs->e2fs.e2fs_features_incompat & EXT2F_INCOMPAT_FTYPE)) { + dirtemplate.dotdot_type = EXT2_FT_DIR; + } + dirtemplate.dotdot_name[0] = dirtemplate.dotdot_name[1] = '.'; + error = vn_rdwr(UIO_WRITE, tvp, (void *)&dirtemplate, + sizeof (dirtemplate), (off_t)0, UIO_SYSSPACE, + IO_NODELOCKED|IO_SYNC, cnp->cn_cred, (size_t *)0, NULL); + if (error) { + dp->i_e2fs_nlink--; + dp->i_flag |= IN_CHANGE; + goto bad; + } + if (VTOI(dvp)->i_e2fs->e2fs_bsize > dvp->v_mount->mnt_stat.f_bsize) + panic("ext2fs_mkdir: blksize"); /* XXX should grow with balloc() */ + else { + error = ext2fs_setsize(ip, VTOI(dvp)->i_e2fs->e2fs_bsize); + if (error) { + dp->i_e2fs_nlink--; + dp->i_flag |= IN_CHANGE; + goto bad; + } + ip->i_flag |= IN_CHANGE; + uvm_vnp_setsize(tvp, ext2fs_size(ip)); + } + + /* Directory set up, now install it's entry in the parent directory. */ + error = ext2fs_direnter(ip, dvp, ulr, cnp); + if (error != 0) { + dp->i_e2fs_nlink--; + dp->i_flag |= IN_CHANGE; + } +bad: + /* + * No need to do an explicit ext2fs_truncate here, vrele will do this + * for us because we set the link count to 0. + */ + if (error) { + ip->i_e2fs_nlink = 0; + ip->i_flag |= IN_CHANGE; + vput(tvp); + } else { + VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK); + *ap->a_vpp = tvp; + } +out: + vput(dvp); + return (error); +} + +/* + * Rmdir system call. + */ +int +ext2fs_rmdir(void *v) +{ + struct vop_rmdir_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct vnode *dvp = ap->a_dvp; + struct componentname *cnp = ap->a_cnp; + struct inode *ip, *dp; + int error; + struct ufs_lookup_results *ulr; + + ip = VTOI(vp); + dp = VTOI(dvp); + + /* XXX should handle this material another way */ + ulr = &dp->i_crap; + UFS_CHECK_CRAPCOUNTER(dp); + + /* + * No rmdir "." please. + */ + if (dp == ip) { + vrele(dvp); + vput(vp); + return (EINVAL); + } + /* + * Verify the directory is empty (and valid). + * (Rmdir ".." won't be valid since + * ".." will contain a reference to + * the current directory and thus be + * non-empty.) + */ + error = 0; + if (ip->i_e2fs_nlink != 2 || + !ext2fs_dirempty(ip, dp->i_number, cnp->cn_cred)) { + error = ENOTEMPTY; + goto out; + } + if ((dp->i_e2fs_flags & EXT2_APPEND) || + (ip->i_e2fs_flags & (EXT2_IMMUTABLE | EXT2_APPEND))) { + error = EPERM; + goto out; + } + /* + * Delete reference to directory before purging + * inode. If we crash in between, the directory + * will be reattached to lost+found, + */ + error = ext2fs_dirremove(dvp, ulr, cnp); + if (error != 0) + goto out; + dp->i_e2fs_nlink--; + dp->i_flag |= IN_CHANGE; + VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK); + cache_purge(dvp); + vput(dvp); + dvp = NULL; + /* + * Truncate inode. The only stuff left + * in the directory is "." and "..". The + * "." reference is inconsequential since + * we're quashing it. The ".." reference + * has already been adjusted above. We've + * removed the "." 
reference and the reference + * in the parent directory, but there may be + * other hard links so decrement by 2 and + * worry about them later. + */ + ip->i_e2fs_nlink -= 2; + error = ext2fs_truncate(vp, (off_t)0, IO_SYNC, cnp->cn_cred); + cache_purge(ITOV(ip)); +out: + VN_KNOTE(vp, NOTE_DELETE); + if (dvp) + vput(dvp); + vput(vp); + return (error); +} + +/* + * symlink -- make a symbolic link + */ +int +ext2fs_symlink(void *v) +{ + struct vop_symlink_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + char *a_target; + } */ *ap = v; + struct vnode *vp, **vpp; + struct inode *ip; + int len, error; + + vpp = ap->a_vpp; + error = ext2fs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp, + vpp, ap->a_cnp); + if (error) + return (error); + VN_KNOTE(ap->a_dvp, NOTE_WRITE); + vp = *vpp; + len = strlen(ap->a_target); + ip = VTOI(vp); + if (len < ip->i_ump->um_maxsymlinklen) { + memcpy((char *)ip->i_din.e2fs_din->e2di_shortlink, ap->a_target, len); + error = ext2fs_setsize(ip, len); + if (error) + goto bad; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (vp->v_mount->mnt_flag & MNT_RELATIME) + ip->i_flag |= IN_ACCESS; + uvm_vnp_setsize(vp, len); + } else + error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0, + UIO_SYSSPACE, IO_NODELOCKED, ap->a_cnp->cn_cred, + (size_t *)0, NULL); +bad: + if (error) + vput(vp); + return (error); +} + +/* + * Return target name of a symbolic link + */ +int +ext2fs_readlink(void *v) +{ + struct vop_readlink_args /* { + struct vnode *a_vp; + struct uio *a_uio; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct ufsmount *ump = ip->i_ump; + int isize; + + isize = ext2fs_size(ip); + if (isize < ump->um_maxsymlinklen || + (ump->um_maxsymlinklen == 0 && ip->i_e2fs_nblock == 0)) { + uiomove((char *)ip->i_din.e2fs_din->e2di_shortlink, isize, ap->a_uio); + return (0); + } + return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred)); +} + +/* + * Advisory record locking support + */ +int +ext2fs_advlock(void *v) +{ + struct vop_advlock_args /* { + struct vnode *a_vp; + void * a_id; + int a_op; + struct flock *a_fl; + int a_flags; + } */ *ap = v; + struct inode *ip = VTOI(ap->a_vp); + + return lf_advlock(ap, &ip->i_lockf, ext2fs_size(ip)); +} + +int +ext2fs_fsync(void *v) +{ + struct vop_fsync_args /* { + struct vnode *a_vp; + kauth_cred_t a_cred; + int a_flags; + off_t offlo; + off_t offhi; + struct proc *a_p; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + int wait; + int error; + + wait = (ap->a_flags & FSYNC_WAIT) != 0; + + if (vp->v_type == VBLK) + error = spec_fsync(v); + else + error = vflushbuf(vp, wait); + if (error == 0 && (ap->a_flags & FSYNC_DATAONLY) == 0) + error = ext2fs_update(vp, NULL, NULL, wait ? UPDATE_WAIT : 0); + + if (error == 0 && ap->a_flags & FSYNC_CACHE) { + int l = 0; + error = VOP_IOCTL(VTOI(vp)->i_devvp, DIOCCACHESYNC, &l, FWRITE, + curlwp->l_cred); + } + + return error; +} + +/* + * Initialize the vnode associated with a new inode, handle aliased + * vnodes. 
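+ * Character and block special files have their vnode switched to the + * spec op vector and their on-disk rdev registered via spec_node_init(); + * FIFOs are switched to the fifo op vector; all other types keep the + * ext2fs vnode ops.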
+ */ +int +ext2fs_vinit(struct mount *mntp, int (**specops)(void *), + int (**fifoops)(void *), struct vnode **vpp) +{ + struct timeval tv; + struct inode *ip; + struct vnode *vp; + + vp = *vpp; + ip = VTOI(vp); + switch(vp->v_type = IFTOVT(ip->i_e2fs_mode)) { + case VCHR: + case VBLK: + vp->v_op = specops; + spec_node_init(vp, fs2h32(ip->i_din.e2fs_din->e2di_rdev)); + break; + case VFIFO: + vp->v_op = fifoops; + break; + case VNON: + case VBAD: + case VSOCK: + case VLNK: + case VDIR: + case VREG: + break; + } + if (ip->i_number == ROOTINO) + vp->v_vflag |= VV_ROOT; + /* + * Initialize modrev times + */ + getmicrouptime(&tv); + SETHIGH(ip->i_modrev, tv.tv_sec); + SETLOW(ip->i_modrev, tv.tv_usec * 4294); + *vpp = vp; + return (0); +} + +/* + * Allocate a new inode. + */ +int +ext2fs_makeinode(int mode, struct vnode *dvp, struct vnode **vpp, + struct componentname *cnp) +{ + struct inode *ip, *pdir; + struct vnode *tvp; + int error, ismember = 0; + struct ufs_lookup_results *ulr; + + pdir = VTOI(dvp); + + /* XXX should handle this material another way */ + ulr = &pdir->i_crap; + UFS_CHECK_CRAPCOUNTER(pdir); + + *vpp = NULL; + if ((mode & IFMT) == 0) + mode |= IFREG; + + if ((error = ext2fs_valloc(dvp, mode, cnp->cn_cred, &tvp)) != 0) { + vput(dvp); + return (error); + } + ip = VTOI(tvp); + ip->i_uid = kauth_cred_geteuid(cnp->cn_cred); + ip->i_e2fs_uid = ip->i_uid & 0xffff; + ip->i_e2fs_gid = pdir->i_e2fs_gid; + if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0) { + ip->i_e2fs_uid_high = (ip->i_uid >> 16) & 0xffff; + ip->i_e2fs_gid_high = pdir->i_e2fs_gid_high; + } else { + ip->i_e2fs_uid_high = 0; + ip->i_e2fs_gid_high = 0; + } + ip->i_gid = ip->i_e2fs_gid | (ip->i_e2fs_gid_high << 16); + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + ip->i_e2fs_mode = mode; + tvp->v_type = IFTOVT(mode); /* Rest init'd in getnewvnode(). */ + ip->i_e2fs_nlink = 1; + if ((ip->i_e2fs_mode & ISGID) && (kauth_cred_ismember_gid(cnp->cn_cred, + ip->i_gid, &ismember) != 0 || !ismember) && + kauth_authorize_generic(cnp->cn_cred, KAUTH_GENERIC_ISSUSER, NULL)) + ip->i_e2fs_mode &= ~ISGID; + + /* + * Make sure inode goes to disk before directory entry. + */ + if ((error = ext2fs_update(tvp, NULL, NULL, UPDATE_WAIT)) != 0) + goto bad; + error = ext2fs_direnter(ip, dvp, ulr, cnp); + if (error != 0) + goto bad; + vput(dvp); + *vpp = tvp; + return (0); + +bad: + /* + * Write error occurred trying to update the inode + * or the directory so must deallocate the inode. + */ + tvp->v_type = VNON; /* Stop explosion if VBLK */ + ip->i_e2fs_nlink = 0; + ip->i_flag |= IN_CHANGE; + vput(tvp); + vput(dvp); + return (error); +} + +/* + * Reclaim an inode so that it can be used for other purposes. + */ +int +ext2fs_reclaim(void *v) +{ + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + int error; + + /* + * The inode must be freed and updated before being removed + * from its hash chain. Other threads trying to gain a hold + * on the inode will be stalled because it is locked (VI_XLOCK). + */ + if (ip->i_omode == 1 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) + ext2fs_vfree(vp, ip->i_number, ip->i_e2fs_mode); + if ((error = ufs_reclaim(vp)) != 0) + return (error); + if (ip->i_din.e2fs_din != NULL) + pool_put(&ext2fs_dinode_pool, ip->i_din.e2fs_din); + genfs_node_destroy(vp); + pool_put(&ext2fs_inode_pool, vp->v_data); + vp->v_data = NULL; + return (0); +} + +/* Global vfs data structures for ext2fs. 
*/ +int (**ext2fs_vnodeop_p)(void *); +const struct vnodeopv_entry_desc ext2fs_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, ext2fs_lookup }, /* lookup */ + { &vop_create_desc, ext2fs_create }, /* create */ + { &vop_mknod_desc, ext2fs_mknod }, /* mknod */ + { &vop_open_desc, ext2fs_open }, /* open */ + { &vop_close_desc, ufs_close }, /* close */ + { &vop_access_desc, ext2fs_access }, /* access */ + { &vop_getattr_desc, ext2fs_getattr }, /* getattr */ + { &vop_setattr_desc, ext2fs_setattr }, /* setattr */ + { &vop_read_desc, ext2fs_read }, /* read */ + { &vop_write_desc, ext2fs_write }, /* write */ + { &vop_ioctl_desc, ufs_ioctl }, /* ioctl */ + { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */ + { &vop_poll_desc, ufs_poll }, /* poll */ + { &vop_kqfilter_desc, genfs_kqfilter }, /* kqfilter */ + { &vop_revoke_desc, ufs_revoke }, /* revoke */ + { &vop_mmap_desc, ufs_mmap }, /* mmap */ + { &vop_fsync_desc, ext2fs_fsync }, /* fsync */ + { &vop_seek_desc, ufs_seek }, /* seek */ + { &vop_remove_desc, ext2fs_remove }, /* remove */ + { &vop_link_desc, ext2fs_link }, /* link */ + { &vop_rename_desc, ext2fs_rename }, /* rename */ + { &vop_mkdir_desc, ext2fs_mkdir }, /* mkdir */ + { &vop_rmdir_desc, ext2fs_rmdir }, /* rmdir */ + { &vop_symlink_desc, ext2fs_symlink }, /* symlink */ + { &vop_readdir_desc, ext2fs_readdir }, /* readdir */ + { &vop_readlink_desc, ext2fs_readlink }, /* readlink */ + { &vop_abortop_desc, ufs_abortop }, /* abortop */ + { &vop_inactive_desc, ext2fs_inactive }, /* inactive */ + { &vop_reclaim_desc, ext2fs_reclaim }, /* reclaim */ + { &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, ext2fs_bmap }, /* bmap */ + { &vop_strategy_desc, ufs_strategy }, /* strategy */ + { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, ufs_pathconf }, /* pathconf */ + { &vop_advlock_desc, ext2fs_advlock }, /* advlock */ + { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ + { &vop_getpages_desc, genfs_getpages }, /* getpages */ + { &vop_putpages_desc, genfs_putpages }, /* putpages */ + { NULL, NULL } +}; +const struct vnodeopv_desc ext2fs_vnodeop_opv_desc = + { &ext2fs_vnodeop_p, ext2fs_vnodeop_entries }; + +int (**ext2fs_specop_p)(void *); +const struct vnodeopv_entry_desc ext2fs_specop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, spec_lookup }, /* lookup */ + { &vop_create_desc, spec_create }, /* create */ + { &vop_mknod_desc, spec_mknod }, /* mknod */ + { &vop_open_desc, spec_open }, /* open */ + { &vop_close_desc, ufsspec_close }, /* close */ + { &vop_access_desc, ext2fs_access }, /* access */ + { &vop_getattr_desc, ext2fs_getattr }, /* getattr */ + { &vop_setattr_desc, ext2fs_setattr }, /* setattr */ + { &vop_read_desc, ufsspec_read }, /* read */ + { &vop_write_desc, ufsspec_write }, /* write */ + { &vop_ioctl_desc, spec_ioctl }, /* ioctl */ + { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */ + { &vop_poll_desc, spec_poll }, /* poll */ + { &vop_kqfilter_desc, spec_kqfilter }, /* kqfilter */ + { &vop_revoke_desc, spec_revoke }, /* revoke */ + { &vop_mmap_desc, spec_mmap }, /* mmap */ + { &vop_fsync_desc, ext2fs_fsync }, /* fsync */ + { &vop_seek_desc, spec_seek }, /* seek */ + { &vop_remove_desc, spec_remove }, /* remove */ + { &vop_link_desc, spec_link }, /* link */ + { &vop_rename_desc, spec_rename }, /* rename */ + { &vop_mkdir_desc, spec_mkdir }, /* mkdir */ + { &vop_rmdir_desc, spec_rmdir }, /* rmdir */ + 
{ &vop_symlink_desc, spec_symlink }, /* symlink */ + { &vop_readdir_desc, spec_readdir }, /* readdir */ + { &vop_readlink_desc, spec_readlink }, /* readlink */ + { &vop_abortop_desc, spec_abortop }, /* abortop */ + { &vop_inactive_desc, ext2fs_inactive }, /* inactive */ + { &vop_reclaim_desc, ext2fs_reclaim }, /* reclaim */ + { &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, spec_bmap }, /* bmap */ + { &vop_strategy_desc, spec_strategy }, /* strategy */ + { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, spec_pathconf }, /* pathconf */ + { &vop_advlock_desc, spec_advlock }, /* advlock */ + { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ + { &vop_getpages_desc, spec_getpages }, /* getpages */ + { &vop_putpages_desc, spec_putpages }, /* putpages */ + { NULL, NULL } +}; +const struct vnodeopv_desc ext2fs_specop_opv_desc = + { &ext2fs_specop_p, ext2fs_specop_entries }; + +int (**ext2fs_fifoop_p)(void *); +const struct vnodeopv_entry_desc ext2fs_fifoop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, vn_fifo_bypass }, /* lookup */ + { &vop_create_desc, vn_fifo_bypass }, /* create */ + { &vop_mknod_desc, vn_fifo_bypass }, /* mknod */ + { &vop_open_desc, vn_fifo_bypass }, /* open */ + { &vop_close_desc, ufsfifo_close }, /* close */ + { &vop_access_desc, ext2fs_access }, /* access */ + { &vop_getattr_desc, ext2fs_getattr }, /* getattr */ + { &vop_setattr_desc, ext2fs_setattr }, /* setattr */ + { &vop_read_desc, ufsfifo_read }, /* read */ + { &vop_write_desc, ufsfifo_write }, /* write */ + { &vop_ioctl_desc, vn_fifo_bypass }, /* ioctl */ + { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */ + { &vop_poll_desc, vn_fifo_bypass }, /* poll */ + { &vop_kqfilter_desc, vn_fifo_bypass }, /* kqfilter */ + { &vop_revoke_desc, vn_fifo_bypass }, /* revoke */ + { &vop_mmap_desc, vn_fifo_bypass }, /* mmap */ + { &vop_fsync_desc, ext2fs_fsync }, /* fsync */ + { &vop_seek_desc, vn_fifo_bypass }, /* seek */ + { &vop_remove_desc, vn_fifo_bypass }, /* remove */ + { &vop_link_desc, vn_fifo_bypass }, /* link */ + { &vop_rename_desc, vn_fifo_bypass }, /* rename */ + { &vop_mkdir_desc, vn_fifo_bypass }, /* mkdir */ + { &vop_rmdir_desc, vn_fifo_bypass }, /* rmdir */ + { &vop_symlink_desc, vn_fifo_bypass }, /* symlink */ + { &vop_readdir_desc, vn_fifo_bypass }, /* readdir */ + { &vop_readlink_desc, vn_fifo_bypass }, /* readlink */ + { &vop_abortop_desc, vn_fifo_bypass }, /* abortop */ + { &vop_inactive_desc, ext2fs_inactive }, /* inactive */ + { &vop_reclaim_desc, ext2fs_reclaim }, /* reclaim */ + { &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, vn_fifo_bypass }, /* bmap */ + { &vop_strategy_desc, vn_fifo_bypass }, /* strategy */ + { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, vn_fifo_bypass }, /* pathconf */ + { &vop_advlock_desc, vn_fifo_bypass }, /* advlock */ + { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ + { &vop_putpages_desc, vn_fifo_bypass }, /* putpages */ + { NULL, NULL } +}; +const struct vnodeopv_desc ext2fs_fifoop_opv_desc = + { &ext2fs_fifoop_p, ext2fs_fifoop_entries }; diff --git a/sys/ufs/ffs/Makefile b/sys/ufs/ffs/Makefile new file mode 100644 index 000000000..1f03afcc4 --- /dev/null +++ b/sys/ufs/ffs/Makefile @@ -0,0 +1,7 @@ +# $NetBSD: Makefile,v 1.1 1998/06/12 23:23:11 cgd Exp $ + +INCSDIR= /usr/include/ufs/ffs + 
+INCS= ffs_extern.h fs.h + +.include diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c new file mode 100644 index 000000000..411f1a83e --- /dev/null +++ b/sys/ufs/ffs/ffs_alloc.c @@ -0,0 +1,2030 @@ +/* $NetBSD: ffs_alloc.c,v 1.130 2011/11/28 08:05:07 tls Exp $ */ + +/*- + * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Marshall + * Kirk McKusick and Network Associates Laboratories, the Security + * Research Division of Network Associates, Inc. under DARPA/SPAWAR + * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS + * research program + * + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_alloc.c 8.19 (Berkeley) 7/13/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.130 2011/11/28 08:05:07 tls Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_ffs.h" +#include "opt_quota.h" +#include "opt_uvm_page_trkown.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifdef UVM_PAGE_TRKOWN +#include +#endif + +static daddr_t ffs_alloccg(struct inode *, int, daddr_t, int, int); +static daddr_t ffs_alloccgblk(struct inode *, struct buf *, daddr_t, int); +static ino_t ffs_dirpref(struct inode *); +static daddr_t ffs_fragextend(struct inode *, int, daddr_t, int, int); +static void ffs_fserr(struct fs *, u_int, const char *); +static daddr_t ffs_hashalloc(struct inode *, int, daddr_t, int, int, + daddr_t (*)(struct inode *, int, daddr_t, int, int)); +static daddr_t ffs_nodealloccg(struct inode *, int, daddr_t, int, int); +static int32_t ffs_mapsearch(struct fs *, struct cg *, + daddr_t, int); +static void ffs_blkfree_common(struct ufsmount *, struct fs *, dev_t, struct buf *, + daddr_t, long, bool); +static void ffs_freefile_common(struct ufsmount *, struct fs *, dev_t, struct buf *, ino_t, + int, bool); + +/* if 1, changes in optimalization strategy are logged */ +int ffs_log_changeopt = 0; + +/* in ffs_tables.c */ +extern const int inside[], around[]; +extern const u_char * const fragtbl[]; + +/* Basic consistency check for block allocations */ +static int +ffs_check_bad_allocation(const char *func, struct fs *fs, daddr_t bno, + long size, dev_t dev, ino_t inum) +{ + if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 || + fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) { + printf("dev = 0x%llx, bno = %" PRId64 " bsize = %d, " + "size = %ld, fs = %s\n", + (long long)dev, bno, fs->fs_bsize, size, fs->fs_fsmnt); + panic("%s: bad size", func); + } + + if (bno >= fs->fs_size) { + printf("bad block %" PRId64 ", ino %llu\n", bno, + (unsigned long long)inum); + ffs_fserr(fs, inum, "bad block"); + return EINVAL; + } + return 0; +} + +/* + * Allocate a block in the file system. + * + * The size of the requested block is given, which must be some + * multiple of fs_fsize and <= fs_bsize. + * A preference may be optionally specified. If a preference is given + * the following hierarchy is used to allocate a block: + * 1) allocate the requested block. + * 2) allocate a rotationally optimal block in the same cylinder. + * 3) allocate a block in the same cylinder group. + * 4) quadradically rehash into other cylinder groups, until an + * available block is located. + * If no block preference is given the following hierarchy is used + * to allocate a block: + * 1) allocate a block in the cylinder group that contains the + * inode for the file. + * 2) quadradically rehash into other cylinder groups, until an + * available block is located. 
+ * + * => called with um_lock held + * => releases um_lock before returning + */ +int +ffs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, int size, int flags, + kauth_cred_t cred, daddr_t *bnp) +{ + struct ufsmount *ump; + struct fs *fs; + daddr_t bno; + int cg; +#if defined(QUOTA) || defined(QUOTA2) + int error; +#endif + + fs = ip->i_fs; + ump = ip->i_ump; + + KASSERT(mutex_owned(&ump->um_lock)); + +#ifdef UVM_PAGE_TRKOWN + + /* + * Sanity-check that allocations within the file size + * do not allow other threads to read the stale contents + * of newly allocated blocks. + * Usually pages will exist to cover the new allocation. + * There is an optimization in ffs_write() where we skip + * creating pages if several conditions are met: + * - the file must not be mapped (in any user address space). + * - the write must cover whole pages and whole blocks. + * If those conditions are not met then pages must exist and + * be locked by the current thread. + */ + + if (ITOV(ip)->v_type == VREG && + lblktosize(fs, (voff_t)lbn) < round_page(ITOV(ip)->v_size)) { + struct vm_page *pg; + struct vnode *vp = ITOV(ip); + struct uvm_object *uobj = &vp->v_uobj; + voff_t off = trunc_page(lblktosize(fs, lbn)); + voff_t endoff = round_page(lblktosize(fs, lbn) + size); + + mutex_enter(uobj->vmobjlock); + while (off < endoff) { + pg = uvm_pagelookup(uobj, off); + KASSERT((pg == NULL && (vp->v_vflag & VV_MAPPED) == 0 && + (size & PAGE_MASK) == 0 && + blkoff(fs, size) == 0) || + (pg != NULL && pg->owner == curproc->p_pid && + pg->lowner == curlwp->l_lid)); + off += PAGE_SIZE; + } + mutex_exit(uobj->vmobjlock); + } +#endif + + *bnp = 0; +#ifdef DIAGNOSTIC + if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { + printf("dev = 0x%llx, bsize = %d, size = %d, fs = %s\n", + (unsigned long long)ip->i_dev, fs->fs_bsize, size, + fs->fs_fsmnt); + panic("ffs_alloc: bad size"); + } + if (cred == NOCRED) + panic("ffs_alloc: missing credential"); +#endif /* DIAGNOSTIC */ + if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) + goto nospace; + if (freespace(fs, fs->fs_minfree) <= 0 && + kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL, + NULL, NULL) != 0) + goto nospace; +#if defined(QUOTA) || defined(QUOTA2) + mutex_exit(&ump->um_lock); + if ((error = chkdq(ip, btodb(size), cred, 0)) != 0) + return (error); + mutex_enter(&ump->um_lock); +#endif + + if (bpref >= fs->fs_size) + bpref = 0; + if (bpref == 0) + cg = ino_to_cg(fs, ip->i_number); + else + cg = dtog(fs, bpref); + bno = ffs_hashalloc(ip, cg, bpref, size, flags, ffs_alloccg); + if (bno > 0) { + DIP_ADD(ip, blocks, btodb(size)); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + *bnp = bno; + return (0); + } +#if defined(QUOTA) || defined(QUOTA2) + /* + * Restore user's disk quota because allocation failed. + */ + (void) chkdq(ip, -btodb(size), cred, FORCE); +#endif + if (flags & B_CONTIG) { + /* + * XXX ump->um_lock handling is "suspect" at best. + * For the case where ffs_hashalloc() fails early + * in the B_CONTIG case we reach here with um_lock + * already unlocked, so we can't release it again + * like in the normal error path. See kern/39206. + * + * + * Fail silently - it's up to our caller to report + * errors. 
+ */ + return (ENOSPC); + } +nospace: + mutex_exit(&ump->um_lock); + ffs_fserr(fs, kauth_cred_geteuid(cred), "file system full"); + uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt); + return (ENOSPC); +} + +/* + * Reallocate a fragment to a bigger size + * + * The number and size of the old block is given, and a preference + * and new size is also specified. The allocator attempts to extend + * the original block. Failing that, the regular block allocator is + * invoked to get an appropriate block. + * + * => called with um_lock held + * => return with um_lock released + */ +int +ffs_realloccg(struct inode *ip, daddr_t lbprev, daddr_t bpref, int osize, + int nsize, kauth_cred_t cred, struct buf **bpp, daddr_t *blknop) +{ + struct ufsmount *ump; + struct fs *fs; + struct buf *bp; + int cg, request, error; + daddr_t bprev, bno; + + fs = ip->i_fs; + ump = ip->i_ump; + + KASSERT(mutex_owned(&ump->um_lock)); + +#ifdef UVM_PAGE_TRKOWN + + /* + * Sanity-check that allocations within the file size + * do not allow other threads to read the stale contents + * of newly allocated blocks. + * Unlike in ffs_alloc(), here pages must always exist + * for such allocations, because only the last block of a file + * can be a fragment and ffs_write() will reallocate the + * fragment to the new size using ufs_balloc_range(), + * which always creates pages to cover blocks it allocates. + */ + + if (ITOV(ip)->v_type == VREG) { + struct vm_page *pg; + struct uvm_object *uobj = &ITOV(ip)->v_uobj; + voff_t off = trunc_page(lblktosize(fs, lbprev)); + voff_t endoff = round_page(lblktosize(fs, lbprev) + osize); + + mutex_enter(uobj->vmobjlock); + while (off < endoff) { + pg = uvm_pagelookup(uobj, off); + KASSERT(pg->owner == curproc->p_pid && + pg->lowner == curlwp->l_lid); + off += PAGE_SIZE; + } + mutex_exit(uobj->vmobjlock); + } +#endif + +#ifdef DIAGNOSTIC + if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || + (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { + printf( + "dev = 0x%llx, bsize = %d, osize = %d, nsize = %d, fs = %s\n", + (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize, + fs->fs_fsmnt); + panic("ffs_realloccg: bad size"); + } + if (cred == NOCRED) + panic("ffs_realloccg: missing credential"); +#endif /* DIAGNOSTIC */ + if (freespace(fs, fs->fs_minfree) <= 0 && + kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL, + NULL, NULL) != 0) { + mutex_exit(&ump->um_lock); + goto nospace; + } + if (fs->fs_magic == FS_UFS2_MAGIC) + bprev = ufs_rw64(ip->i_ffs2_db[lbprev], UFS_FSNEEDSWAP(fs)); + else + bprev = ufs_rw32(ip->i_ffs1_db[lbprev], UFS_FSNEEDSWAP(fs)); + + if (bprev == 0) { + printf("dev = 0x%llx, bsize = %d, bprev = %" PRId64 ", fs = %s\n", + (unsigned long long)ip->i_dev, fs->fs_bsize, bprev, + fs->fs_fsmnt); + panic("ffs_realloccg: bad bprev"); + } + mutex_exit(&ump->um_lock); + + /* + * Allocate the extra space in the buffer. + */ + if (bpp != NULL && + (error = bread(ITOV(ip), lbprev, osize, NOCRED, 0, &bp)) != 0) { + brelse(bp, 0); + return (error); + } +#if defined(QUOTA) || defined(QUOTA2) + if ((error = chkdq(ip, btodb(nsize - osize), cred, 0)) != 0) { + if (bpp != NULL) { + brelse(bp, 0); + } + return (error); + } +#endif + /* + * Check for extension in the existing location. 
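+ * For example, with 1 KB fragments in an 8 KB block, growing the last + * fragment of a file from 2 KB to 3 KB only needs the next fragment of + * the same block to be free, in which case ffs_fragextend() claims it in + * place; otherwise a new location is allocated below and the old + * fragments are released.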
+ */ + cg = dtog(fs, bprev); + mutex_enter(&ump->um_lock); + if ((bno = ffs_fragextend(ip, cg, bprev, osize, nsize)) != 0) { + DIP_ADD(ip, blocks, btodb(nsize - osize)); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + + if (bpp != NULL) { + if (bp->b_blkno != fsbtodb(fs, bno)) + panic("bad blockno"); + allocbuf(bp, nsize, 1); + memset((char *)bp->b_data + osize, 0, nsize - osize); + mutex_enter(bp->b_objlock); + KASSERT(!cv_has_waiters(&bp->b_done)); + bp->b_oflags |= BO_DONE; + mutex_exit(bp->b_objlock); + *bpp = bp; + } + if (blknop != NULL) { + *blknop = bno; + } + return (0); + } + /* + * Allocate a new disk location. + */ + if (bpref >= fs->fs_size) + bpref = 0; + switch ((int)fs->fs_optim) { + case FS_OPTSPACE: + /* + * Allocate an exact sized fragment. Although this makes + * best use of space, we will waste time relocating it if + * the file continues to grow. If the fragmentation is + * less than half of the minimum free reserve, we choose + * to begin optimizing for time. + */ + request = nsize; + if (fs->fs_minfree < 5 || + fs->fs_cstotal.cs_nffree > + fs->fs_dsize * fs->fs_minfree / (2 * 100)) + break; + + if (ffs_log_changeopt) { + log(LOG_NOTICE, + "%s: optimization changed from SPACE to TIME\n", + fs->fs_fsmnt); + } + + fs->fs_optim = FS_OPTTIME; + break; + case FS_OPTTIME: + /* + * At this point we have discovered a file that is trying to + * grow a small fragment to a larger fragment. To save time, + * we allocate a full sized block, then free the unused portion. + * If the file continues to grow, the `ffs_fragextend' call + * above will be able to grow it in place without further + * copying. If aberrant programs cause disk fragmentation to + * grow within 2% of the free reserve, we choose to begin + * optimizing for space. + */ + request = fs->fs_bsize; + if (fs->fs_cstotal.cs_nffree < + fs->fs_dsize * (fs->fs_minfree - 2) / 100) + break; + + if (ffs_log_changeopt) { + log(LOG_NOTICE, + "%s: optimization changed from TIME to SPACE\n", + fs->fs_fsmnt); + } + + fs->fs_optim = FS_OPTSPACE; + break; + default: + printf("dev = 0x%llx, optim = %d, fs = %s\n", + (unsigned long long)ip->i_dev, fs->fs_optim, fs->fs_fsmnt); + panic("ffs_realloccg: bad optim"); + /* NOTREACHED */ + } + bno = ffs_hashalloc(ip, cg, bpref, request, 0, ffs_alloccg); + if (bno > 0) { + if ((ip->i_ump->um_mountp->mnt_wapbl) && + (ITOV(ip)->v_type != VREG)) { + UFS_WAPBL_REGISTER_DEALLOCATION( + ip->i_ump->um_mountp, fsbtodb(fs, bprev), + osize); + } else { + ffs_blkfree(fs, ip->i_devvp, bprev, (long)osize, + ip->i_number); + } + if (nsize < request) { + if ((ip->i_ump->um_mountp->mnt_wapbl) && + (ITOV(ip)->v_type != VREG)) { + UFS_WAPBL_REGISTER_DEALLOCATION( + ip->i_ump->um_mountp, + fsbtodb(fs, (bno + numfrags(fs, nsize))), + request - nsize); + } else + ffs_blkfree(fs, ip->i_devvp, + bno + numfrags(fs, nsize), + (long)(request - nsize), ip->i_number); + } + DIP_ADD(ip, blocks, btodb(nsize - osize)); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (bpp != NULL) { + bp->b_blkno = fsbtodb(fs, bno); + allocbuf(bp, nsize, 1); + memset((char *)bp->b_data + osize, 0, (u_int)nsize - osize); + mutex_enter(bp->b_objlock); + KASSERT(!cv_has_waiters(&bp->b_done)); + bp->b_oflags |= BO_DONE; + mutex_exit(bp->b_objlock); + *bpp = bp; + } + if (blknop != NULL) { + *blknop = bno; + } + return (0); + } + mutex_exit(&ump->um_lock); + +#if defined(QUOTA) || defined(QUOTA2) + /* + * Restore user's disk quota because allocation failed. 
+ */ + (void) chkdq(ip, -btodb(nsize - osize), cred, FORCE); +#endif + if (bpp != NULL) { + brelse(bp, 0); + } + +nospace: + /* + * no space available + */ + ffs_fserr(fs, kauth_cred_geteuid(cred), "file system full"); + uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt); + return (ENOSPC); +} + +/* + * Allocate an inode in the file system. + * + * If allocating a directory, use ffs_dirpref to select the inode. + * If allocating in a directory, the following hierarchy is followed: + * 1) allocate the preferred inode. + * 2) allocate an inode in the same cylinder group. + * 3) quadradically rehash into other cylinder groups, until an + * available inode is located. + * If no inode preference is given the following hierarchy is used + * to allocate an inode: + * 1) allocate an inode in cylinder group 0. + * 2) quadradically rehash into other cylinder groups, until an + * available inode is located. + * + * => um_lock not held upon entry or return + */ +int +ffs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred, + struct vnode **vpp) +{ + struct ufsmount *ump; + struct inode *pip; + struct fs *fs; + struct inode *ip; + struct timespec ts; + ino_t ino, ipref; + int cg, error; + + UFS_WAPBL_JUNLOCK_ASSERT(pvp->v_mount); + + *vpp = NULL; + pip = VTOI(pvp); + fs = pip->i_fs; + ump = pip->i_ump; + + error = UFS_WAPBL_BEGIN(pvp->v_mount); + if (error) { + return error; + } + mutex_enter(&ump->um_lock); + if (fs->fs_cstotal.cs_nifree == 0) + goto noinodes; + + if ((mode & IFMT) == IFDIR) + ipref = ffs_dirpref(pip); + else + ipref = pip->i_number; + if (ipref >= fs->fs_ncg * fs->fs_ipg) + ipref = 0; + cg = ino_to_cg(fs, ipref); + /* + * Track number of dirs created one after another + * in a same cg without intervening by files. + */ + if ((mode & IFMT) == IFDIR) { + if (fs->fs_contigdirs[cg] < 255) + fs->fs_contigdirs[cg]++; + } else { + if (fs->fs_contigdirs[cg] > 0) + fs->fs_contigdirs[cg]--; + } + ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, ffs_nodealloccg); + if (ino == 0) + goto noinodes; + UFS_WAPBL_END(pvp->v_mount); + error = VFS_VGET(pvp->v_mount, ino, vpp); + if (error) { + int err; + err = UFS_WAPBL_BEGIN(pvp->v_mount); + if (err == 0) + ffs_vfree(pvp, ino, mode); + if (err == 0) + UFS_WAPBL_END(pvp->v_mount); + return (error); + } + KASSERT((*vpp)->v_type == VNON); + ip = VTOI(*vpp); + if (ip->i_mode) { +#if 0 + printf("mode = 0%o, inum = %d, fs = %s\n", + ip->i_mode, ip->i_number, fs->fs_fsmnt); +#else + printf("dmode %x mode %x dgen %x gen %x\n", + DIP(ip, mode), ip->i_mode, + DIP(ip, gen), ip->i_gen); + printf("size %llx blocks %llx\n", + (long long)DIP(ip, size), (long long)DIP(ip, blocks)); + printf("ino %llu ipref %llu\n", (unsigned long long)ino, + (unsigned long long)ipref); +#if 0 + error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), + (int)fs->fs_bsize, NOCRED, 0, &bp); +#endif + +#endif + panic("ffs_valloc: dup alloc"); + } + if (DIP(ip, blocks)) { /* XXX */ + printf("free inode %s/%llu had %" PRId64 " blocks\n", + fs->fs_fsmnt, (unsigned long long)ino, DIP(ip, blocks)); + DIP_ASSIGN(ip, blocks, 0); + } + ip->i_flag &= ~IN_SPACECOUNTED; + ip->i_flags = 0; + DIP_ASSIGN(ip, flags, 0); + /* + * Set up a new generation number for this inode. 
+ */ + ip->i_gen++; + DIP_ASSIGN(ip, gen, ip->i_gen); + if (fs->fs_magic == FS_UFS2_MAGIC) { + vfs_timestamp(&ts); + ip->i_ffs2_birthtime = ts.tv_sec; + ip->i_ffs2_birthnsec = ts.tv_nsec; + } + return (0); +noinodes: + mutex_exit(&ump->um_lock); + UFS_WAPBL_END(pvp->v_mount); + ffs_fserr(fs, kauth_cred_geteuid(cred), "out of inodes"); + uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt); + return (ENOSPC); +} + +/* + * Find a cylinder group in which to place a directory. + * + * The policy implemented by this algorithm is to allocate a + * directory inode in the same cylinder group as its parent + * directory, but also to reserve space for its files inodes + * and data. Restrict the number of directories which may be + * allocated one after another in the same cylinder group + * without intervening allocation of files. + * + * If we allocate a first level directory then force allocation + * in another cylinder group. + */ +static ino_t +ffs_dirpref(struct inode *pip) +{ + register struct fs *fs; + int cg, prefcg; + int64_t dirsize, cgsize, curdsz; + int avgifree, avgbfree, avgndir; + int minifree, minbfree, maxndir; + int mincg, minndir; + int maxcontigdirs; + + KASSERT(mutex_owned(&pip->i_ump->um_lock)); + + fs = pip->i_fs; + + avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; + avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; + avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg; + + /* + * Force allocation in another cg if creating a first level dir. + */ + if (ITOV(pip)->v_vflag & VV_ROOT) { + prefcg = random() % fs->fs_ncg; + mincg = prefcg; + minndir = fs->fs_ipg; + for (cg = prefcg; cg < fs->fs_ncg; cg++) + if (fs->fs_cs(fs, cg).cs_ndir < minndir && + fs->fs_cs(fs, cg).cs_nifree >= avgifree && + fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { + mincg = cg; + minndir = fs->fs_cs(fs, cg).cs_ndir; + } + for (cg = 0; cg < prefcg; cg++) + if (fs->fs_cs(fs, cg).cs_ndir < minndir && + fs->fs_cs(fs, cg).cs_nifree >= avgifree && + fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { + mincg = cg; + minndir = fs->fs_cs(fs, cg).cs_ndir; + } + return ((ino_t)(fs->fs_ipg * mincg)); + } + + /* + * Count various limits which used for + * optimal allocation of a directory inode. + */ + maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg); + minifree = avgifree - fs->fs_ipg / 4; + if (minifree < 0) + minifree = 0; + minbfree = avgbfree - fragstoblks(fs, fs->fs_fpg) / 4; + if (minbfree < 0) + minbfree = 0; + cgsize = (int64_t)fs->fs_fsize * fs->fs_fpg; + dirsize = (int64_t)fs->fs_avgfilesize * fs->fs_avgfpdir; + if (avgndir != 0) { + curdsz = (cgsize - (int64_t)avgbfree * fs->fs_bsize) / avgndir; + if (dirsize < curdsz) + dirsize = curdsz; + } + if (cgsize < dirsize * 255) + maxcontigdirs = cgsize / dirsize; + else + maxcontigdirs = 255; + if (fs->fs_avgfpdir > 0) + maxcontigdirs = min(maxcontigdirs, + fs->fs_ipg / fs->fs_avgfpdir); + if (maxcontigdirs == 0) + maxcontigdirs = 1; + + /* + * Limit number of dirs in one cg and reserve space for + * regular files, but only if we have no deficit in + * inodes or space. 
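+ * For example, assuming fs_avgfilesize of 16 KB and fs_avgfpdir of 64, + * dirsize above starts at 1 MB, so a 64 MB cylinder group admits at most + * 64 back-to-back directory creations (tracked in fs_contigdirs[]) before + * the preference moves to another group.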
+ */ + prefcg = ino_to_cg(fs, pip->i_number); + for (cg = prefcg; cg < fs->fs_ncg; cg++) + if (fs->fs_cs(fs, cg).cs_ndir < maxndir && + fs->fs_cs(fs, cg).cs_nifree >= minifree && + fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { + if (fs->fs_contigdirs[cg] < maxcontigdirs) + return ((ino_t)(fs->fs_ipg * cg)); + } + for (cg = 0; cg < prefcg; cg++) + if (fs->fs_cs(fs, cg).cs_ndir < maxndir && + fs->fs_cs(fs, cg).cs_nifree >= minifree && + fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { + if (fs->fs_contigdirs[cg] < maxcontigdirs) + return ((ino_t)(fs->fs_ipg * cg)); + } + /* + * This is a backstop when we are deficient in space. + */ + for (cg = prefcg; cg < fs->fs_ncg; cg++) + if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) + return ((ino_t)(fs->fs_ipg * cg)); + for (cg = 0; cg < prefcg; cg++) + if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) + break; + return ((ino_t)(fs->fs_ipg * cg)); +} + +/* + * Select the desired position for the next block in a file. The file is + * logically divided into sections. The first section is composed of the + * direct blocks. Each additional section contains fs_maxbpg blocks. + * + * If no blocks have been allocated in the first section, the policy is to + * request a block in the same cylinder group as the inode that describes + * the file. If no blocks have been allocated in any other section, the + * policy is to place the section in a cylinder group with a greater than + * average number of free blocks. An appropriate cylinder group is found + * by using a rotor that sweeps the cylinder groups. When a new group of + * blocks is needed, the sweep begins in the cylinder group following the + * cylinder group from which the previous allocation was made. The sweep + * continues until a cylinder group with greater than the average number + * of free blocks is found. If the allocation is for the first block in an + * indirect block, the information on the previous allocation is unavailable; + * here a best guess is made based upon the logical block number being + * allocated. + * + * If a section is already partially allocated, the policy is to + * contiguously allocate fs_maxcontig blocks. The end of one of these + * contiguous blocks and the beginning of the next is laid out + * contigously if possible. + * + * => um_lock held on entry and exit + */ +daddr_t +ffs_blkpref_ufs1(struct inode *ip, daddr_t lbn, int indx, int flags, + int32_t *bap /* XXX ondisk32 */) +{ + struct fs *fs; + int cg; + int avgbfree, startcg; + + KASSERT(mutex_owned(&ip->i_ump->um_lock)); + + fs = ip->i_fs; + + /* + * If allocating a contiguous file with B_CONTIG, use the hints + * in the inode extentions to return the desired block. + * + * For metadata (indirect blocks) return the address of where + * the first indirect block resides - we'll scan for the next + * available slot if we need to allocate more than one indirect + * block. For data, return the address of the actual block + * relative to the address of the first data block. + */ + if (flags & B_CONTIG) { + KASSERT(ip->i_ffs_first_data_blk != 0); + KASSERT(ip->i_ffs_first_indir_blk != 0); + if (flags & B_METAONLY) + return ip->i_ffs_first_indir_blk; + else + return ip->i_ffs_first_data_blk + blkstofrags(fs, lbn); + } + + if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { + if (lbn < NDADDR + NINDIR(fs)) { + cg = ino_to_cg(fs, ip->i_number); + return (cgbase(fs, cg) + fs->fs_frag); + } + /* + * Find a cylinder with greater than average number of + * unused data blocks. 
+ */ + if (indx == 0 || bap[indx - 1] == 0) + startcg = + ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg; + else + startcg = dtog(fs, + ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1); + startcg %= fs->fs_ncg; + avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; + for (cg = startcg; cg < fs->fs_ncg; cg++) + if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { + return (cgbase(fs, cg) + fs->fs_frag); + } + for (cg = 0; cg < startcg; cg++) + if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { + return (cgbase(fs, cg) + fs->fs_frag); + } + return (0); + } + /* + * We just always try to lay things out contiguously. + */ + return ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag; +} + +daddr_t +ffs_blkpref_ufs2(struct inode *ip, daddr_t lbn, int indx, int flags, + int64_t *bap) +{ + struct fs *fs; + int cg; + int avgbfree, startcg; + + KASSERT(mutex_owned(&ip->i_ump->um_lock)); + + fs = ip->i_fs; + + /* + * If allocating a contiguous file with B_CONTIG, use the hints + * in the inode extentions to return the desired block. + * + * For metadata (indirect blocks) return the address of where + * the first indirect block resides - we'll scan for the next + * available slot if we need to allocate more than one indirect + * block. For data, return the address of the actual block + * relative to the address of the first data block. + */ + if (flags & B_CONTIG) { + KASSERT(ip->i_ffs_first_data_blk != 0); + KASSERT(ip->i_ffs_first_indir_blk != 0); + if (flags & B_METAONLY) + return ip->i_ffs_first_indir_blk; + else + return ip->i_ffs_first_data_blk + blkstofrags(fs, lbn); + } + + if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { + if (lbn < NDADDR + NINDIR(fs)) { + cg = ino_to_cg(fs, ip->i_number); + return (cgbase(fs, cg) + fs->fs_frag); + } + /* + * Find a cylinder with greater than average number of + * unused data blocks. + */ + if (indx == 0 || bap[indx - 1] == 0) + startcg = + ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg; + else + startcg = dtog(fs, + ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1); + startcg %= fs->fs_ncg; + avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; + for (cg = startcg; cg < fs->fs_ncg; cg++) + if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { + return (cgbase(fs, cg) + fs->fs_frag); + } + for (cg = 0; cg < startcg; cg++) + if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { + return (cgbase(fs, cg) + fs->fs_frag); + } + return (0); + } + /* + * We just always try to lay things out contiguously. + */ + return ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag; +} + + +/* + * Implement the cylinder overflow algorithm. + * + * The policy implemented by this algorithm is: + * 1) allocate the block in its requested cylinder group. + * 2) quadradically rehash on the cylinder group number. + * 3) brute force search for a free block. 
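+ * For example, with fs_ncg == 16 and a preferred cg of 5, step 2 probes + * cgs 6, 8, 12 and 4 (offsets 1, 2, 4 and 8, wrapping modulo fs_ncg), + * after which step 3 sweeps the remaining groups starting at cg 7.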
+ * + * => called with um_lock held + * => returns with um_lock released on success, held on failure + * (*allocator releases lock on success, retains lock on failure) + */ +/*VARARGS5*/ +static daddr_t +ffs_hashalloc(struct inode *ip, int cg, daddr_t pref, + int size /* size for data blocks, mode for inodes */, + int flags, daddr_t (*allocator)(struct inode *, int, daddr_t, int, int)) +{ + struct fs *fs; + daddr_t result; + int i, icg = cg; + + fs = ip->i_fs; + /* + * 1: preferred cylinder group + */ + result = (*allocator)(ip, cg, pref, size, flags); + if (result) + return (result); + + if (flags & B_CONTIG) + return (result); + /* + * 2: quadratic rehash + */ + for (i = 1; i < fs->fs_ncg; i *= 2) { + cg += i; + if (cg >= fs->fs_ncg) + cg -= fs->fs_ncg; + result = (*allocator)(ip, cg, 0, size, flags); + if (result) + return (result); + } + /* + * 3: brute force search + * Note that we start at i == 2, since 0 was checked initially, + * and 1 is always checked in the quadratic rehash. + */ + cg = (icg + 2) % fs->fs_ncg; + for (i = 2; i < fs->fs_ncg; i++) { + result = (*allocator)(ip, cg, 0, size, flags); + if (result) + return (result); + cg++; + if (cg == fs->fs_ncg) + cg = 0; + } + return (0); +} + +/* + * Determine whether a fragment can be extended. + * + * Check to see if the necessary fragments are available, and + * if they are, allocate them. + * + * => called with um_lock held + * => returns with um_lock released on success, held on failure + */ +static daddr_t +ffs_fragextend(struct inode *ip, int cg, daddr_t bprev, int osize, int nsize) +{ + struct ufsmount *ump; + struct fs *fs; + struct cg *cgp; + struct buf *bp; + daddr_t bno; + int frags, bbase; + int i, error; + u_int8_t *blksfree; + + fs = ip->i_fs; + ump = ip->i_ump; + + KASSERT(mutex_owned(&ump->um_lock)); + + if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize)) + return (0); + frags = numfrags(fs, nsize); + bbase = fragnum(fs, bprev); + if (bbase > fragnum(fs, (bprev + frags - 1))) { + /* cannot extend across a block boundary */ + return (0); + } + mutex_exit(&ump->um_lock); + error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp); + if (error) + goto fail; + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) + goto fail; + cgp->cg_old_time = ufs_rw32(time_second, UFS_FSNEEDSWAP(fs)); + if ((fs->fs_magic != FS_UFS1_MAGIC) || + (fs->fs_old_flags & FS_FLAGS_UPDATED)) + cgp->cg_time = ufs_rw64(time_second, UFS_FSNEEDSWAP(fs)); + bno = dtogd(fs, bprev); + blksfree = cg_blksfree(cgp, UFS_FSNEEDSWAP(fs)); + for (i = numfrags(fs, osize); i < frags; i++) + if (isclr(blksfree, bno + i)) + goto fail; + /* + * the current fragment can be extended + * deduct the count on fragment being extended into + * increase the count on the remaining fragment (if any) + * allocate the extended piece + */ + for (i = frags; i < fs->fs_frag - bbase; i++) + if (isclr(blksfree, bno + i)) + break; + ufs_add32(cgp->cg_frsum[i - numfrags(fs, osize)], -1, UFS_FSNEEDSWAP(fs)); + if (i != frags) + ufs_add32(cgp->cg_frsum[i - frags], 1, UFS_FSNEEDSWAP(fs)); + mutex_enter(&ump->um_lock); + for (i = numfrags(fs, osize); i < frags; i++) { + clrbit(blksfree, bno + i); + ufs_add32(cgp->cg_cs.cs_nffree, -1, UFS_FSNEEDSWAP(fs)); + fs->fs_cstotal.cs_nffree--; + fs->fs_cs(fs, cg).cs_nffree--; + } + fs->fs_fmod = 1; + ACTIVECG_CLR(fs, cg); + mutex_exit(&ump->um_lock); + bdwrite(bp); + return (bprev); + + fail: + brelse(bp, 0); + mutex_enter(&ump->um_lock); + return (0); +} + +/* + * 
Determine whether a block can be allocated. + * + * Check to see if a block of the appropriate size is available, + * and if it is, allocate it. + */ +static daddr_t +ffs_alloccg(struct inode *ip, int cg, daddr_t bpref, int size, int flags) +{ + struct ufsmount *ump; + struct fs *fs = ip->i_fs; + struct cg *cgp; + struct buf *bp; + int32_t bno; + daddr_t blkno; + int error, frags, allocsiz, i; + u_int8_t *blksfree; +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + + ump = ip->i_ump; + + KASSERT(mutex_owned(&ump->um_lock)); + + if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) + return (0); + mutex_exit(&ump->um_lock); + error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp); + if (error) + goto fail; + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp, needswap) || + (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) + goto fail; + cgp->cg_old_time = ufs_rw32(time_second, needswap); + if ((fs->fs_magic != FS_UFS1_MAGIC) || + (fs->fs_old_flags & FS_FLAGS_UPDATED)) + cgp->cg_time = ufs_rw64(time_second, needswap); + if (size == fs->fs_bsize) { + mutex_enter(&ump->um_lock); + blkno = ffs_alloccgblk(ip, bp, bpref, flags); + ACTIVECG_CLR(fs, cg); + mutex_exit(&ump->um_lock); + bdwrite(bp); + return (blkno); + } + /* + * check to see if any fragments are already available + * allocsiz is the size which will be allocated, hacking + * it down to a smaller size if necessary + */ + blksfree = cg_blksfree(cgp, needswap); + frags = numfrags(fs, size); + for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) + if (cgp->cg_frsum[allocsiz] != 0) + break; + if (allocsiz == fs->fs_frag) { + /* + * no fragments were available, so a block will be + * allocated, and hacked up + */ + if (cgp->cg_cs.cs_nbfree == 0) + goto fail; + mutex_enter(&ump->um_lock); + blkno = ffs_alloccgblk(ip, bp, bpref, flags); + bno = dtogd(fs, blkno); + for (i = frags; i < fs->fs_frag; i++) + setbit(blksfree, bno + i); + i = fs->fs_frag - frags; + ufs_add32(cgp->cg_cs.cs_nffree, i, needswap); + fs->fs_cstotal.cs_nffree += i; + fs->fs_cs(fs, cg).cs_nffree += i; + fs->fs_fmod = 1; + ufs_add32(cgp->cg_frsum[i], 1, needswap); + ACTIVECG_CLR(fs, cg); + mutex_exit(&ump->um_lock); + bdwrite(bp); + return (blkno); + } + bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); +#if 0 + /* + * XXX fvdl mapsearch will panic, and never return -1 + * also: returning NULL as daddr_t ? + */ + if (bno < 0) + goto fail; +#endif + for (i = 0; i < frags; i++) + clrbit(blksfree, bno + i); + mutex_enter(&ump->um_lock); + ufs_add32(cgp->cg_cs.cs_nffree, -frags, needswap); + fs->fs_cstotal.cs_nffree -= frags; + fs->fs_cs(fs, cg).cs_nffree -= frags; + fs->fs_fmod = 1; + ufs_add32(cgp->cg_frsum[allocsiz], -1, needswap); + if (frags != allocsiz) + ufs_add32(cgp->cg_frsum[allocsiz - frags], 1, needswap); + blkno = cgbase(fs, cg) + bno; + ACTIVECG_CLR(fs, cg); + mutex_exit(&ump->um_lock); + bdwrite(bp); + return blkno; + + fail: + brelse(bp, 0); + mutex_enter(&ump->um_lock); + return (0); +} + +/* + * Allocate a block in a cylinder group. + * + * This algorithm implements the following policy: + * 1) allocate the requested block. + * 2) allocate a rotationally optimal block in the same cylinder. + * 3) allocate the next available block on the block rotor for the + * specified cylinder group. + * Note that this routine only allocates fs_bsize blocks; these + * blocks may be fragmented by the routine that allocates them. 
+ */ +static daddr_t +ffs_alloccgblk(struct inode *ip, struct buf *bp, daddr_t bpref, int flags) +{ + struct ufsmount *ump; + struct fs *fs = ip->i_fs; + struct cg *cgp; + int cg; + daddr_t blkno; + int32_t bno; + u_int8_t *blksfree; +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + + ump = ip->i_ump; + + KASSERT(mutex_owned(&ump->um_lock)); + + cgp = (struct cg *)bp->b_data; + blksfree = cg_blksfree(cgp, needswap); + if (bpref == 0 || dtog(fs, bpref) != ufs_rw32(cgp->cg_cgx, needswap)) { + bpref = ufs_rw32(cgp->cg_rotor, needswap); + } else { + bpref = blknum(fs, bpref); + bno = dtogd(fs, bpref); + /* + * if the requested block is available, use it + */ + if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno))) + goto gotit; + /* + * if the requested data block isn't available and we are + * trying to allocate a contiguous file, return an error. + */ + if ((flags & (B_CONTIG | B_METAONLY)) == B_CONTIG) + return (0); + } + + /* + * Take the next available block in this cylinder group. + */ + bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag); + if (bno < 0) + return (0); + cgp->cg_rotor = ufs_rw32(bno, needswap); +gotit: + blkno = fragstoblks(fs, bno); + ffs_clrblock(fs, blksfree, blkno); + ffs_clusteracct(fs, cgp, blkno, -1); + ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap); + fs->fs_cstotal.cs_nbfree--; + fs->fs_cs(fs, ufs_rw32(cgp->cg_cgx, needswap)).cs_nbfree--; + if ((fs->fs_magic == FS_UFS1_MAGIC) && + ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) { + int cylno; + cylno = old_cbtocylno(fs, bno); + KASSERT(cylno >= 0); + KASSERT(cylno < fs->fs_old_ncyl); + KASSERT(old_cbtorpos(fs, bno) >= 0); + KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, bno) < fs->fs_old_nrpos); + ufs_add16(old_cg_blks(fs, cgp, cylno, needswap)[old_cbtorpos(fs, bno)], -1, + needswap); + ufs_add32(old_cg_blktot(cgp, needswap)[cylno], -1, needswap); + } + fs->fs_fmod = 1; + cg = ufs_rw32(cgp->cg_cgx, needswap); + blkno = cgbase(fs, cg) + bno; + return (blkno); +} + +/* + * Determine whether an inode can be allocated. + * + * Check to see if an inode is available, and if it is, + * allocate it using the following policy: + * 1) allocate the requested inode. + * 2) allocate the next available inode after the requested + * inode in the specified cylinder group. + */ +static daddr_t +ffs_nodealloccg(struct inode *ip, int cg, daddr_t ipref, int mode, int flags) +{ + struct ufsmount *ump = ip->i_ump; + struct fs *fs = ip->i_fs; + struct cg *cgp; + struct buf *bp, *ibp; + u_int8_t *inosused; + int error, start, len, loc, map, i; + int32_t initediblk; + daddr_t nalloc; + struct ufs2_dinode *dp2; +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + + KASSERT(mutex_owned(&ump->um_lock)); + UFS_WAPBL_JLOCK_ASSERT(ip->i_ump->um_mountp); + + if (fs->fs_cs(fs, cg).cs_nifree == 0) + return (0); + mutex_exit(&ump->um_lock); + ibp = NULL; + initediblk = -1; +retry: + error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp); + if (error) + goto fail; + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp, needswap) || cgp->cg_cs.cs_nifree == 0) + goto fail; + + if (ibp != NULL && + initediblk != ufs_rw32(cgp->cg_initediblk, needswap)) { + /* Another thread allocated more inodes so we retry the test. */ + brelse(ibp, 0); + ibp = NULL; + } + /* + * Check to see if we need to initialize more inodes. 
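+ * (UFS2 only.) Inode blocks beyond cg_initediblk have never been written, + * so once allocations approach that mark a fresh block of INOPB(fs) + * on-disk inodes is zeroed and stamped with random generation numbers + * before the allocation completes.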
+ */ + if (fs->fs_magic == FS_UFS2_MAGIC && ibp == NULL) { + initediblk = ufs_rw32(cgp->cg_initediblk, needswap); + nalloc = fs->fs_ipg - ufs_rw32(cgp->cg_cs.cs_nifree, needswap); + if (nalloc + INOPB(fs) > initediblk && + initediblk < ufs_rw32(cgp->cg_niblk, needswap)) { + /* + * We have to release the cg buffer here to prevent + * a deadlock when reading the inode block will + * run a copy-on-write that might use this cg. + */ + brelse(bp, 0); + bp = NULL; + error = ffs_getblk(ip->i_devvp, fsbtodb(fs, + ino_to_fsba(fs, cg * fs->fs_ipg + initediblk)), + FFS_NOBLK, fs->fs_bsize, false, &ibp); + if (error) + goto fail; + goto retry; + } + } + + cgp->cg_old_time = ufs_rw32(time_second, needswap); + if ((fs->fs_magic != FS_UFS1_MAGIC) || + (fs->fs_old_flags & FS_FLAGS_UPDATED)) + cgp->cg_time = ufs_rw64(time_second, needswap); + inosused = cg_inosused(cgp, needswap); + if (ipref) { + ipref %= fs->fs_ipg; + if (isclr(inosused, ipref)) + goto gotit; + } + start = ufs_rw32(cgp->cg_irotor, needswap) / NBBY; + len = howmany(fs->fs_ipg - ufs_rw32(cgp->cg_irotor, needswap), + NBBY); + loc = skpc(0xff, len, &inosused[start]); + if (loc == 0) { + len = start + 1; + start = 0; + loc = skpc(0xff, len, &inosused[0]); + if (loc == 0) { + printf("cg = %d, irotor = %d, fs = %s\n", + cg, ufs_rw32(cgp->cg_irotor, needswap), + fs->fs_fsmnt); + panic("ffs_nodealloccg: map corrupted"); + /* NOTREACHED */ + } + } + i = start + len - loc; + map = inosused[i] ^ 0xff; + if (map == 0) { + printf("fs = %s\n", fs->fs_fsmnt); + panic("ffs_nodealloccg: block not in map"); + } + ipref = i * NBBY + ffs(map) - 1; + cgp->cg_irotor = ufs_rw32(ipref, needswap); +gotit: + UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp, cg * fs->fs_ipg + ipref, + mode); + /* + * Check to see if we need to initialize more inodes. + */ + if (ibp != NULL) { + KASSERT(initediblk == ufs_rw32(cgp->cg_initediblk, needswap)); + memset(ibp->b_data, 0, fs->fs_bsize); + dp2 = (struct ufs2_dinode *)(ibp->b_data); + for (i = 0; i < INOPB(fs); i++) { + /* + * Don't bother to swap, it's supposed to be + * random, after all. + */ + dp2->di_gen = (cprng_fast32() & INT32_MAX) / 2 + 1; + dp2++; + } + initediblk += INOPB(fs); + cgp->cg_initediblk = ufs_rw32(initediblk, needswap); + } + + mutex_enter(&ump->um_lock); + ACTIVECG_CLR(fs, cg); + setbit(inosused, ipref); + ufs_add32(cgp->cg_cs.cs_nifree, -1, needswap); + fs->fs_cstotal.cs_nifree--; + fs->fs_cs(fs, cg).cs_nifree--; + fs->fs_fmod = 1; + if ((mode & IFMT) == IFDIR) { + ufs_add32(cgp->cg_cs.cs_ndir, 1, needswap); + fs->fs_cstotal.cs_ndir++; + fs->fs_cs(fs, cg).cs_ndir++; + } + mutex_exit(&ump->um_lock); + if (ibp != NULL) { + bwrite(bp); + bawrite(ibp); + } else + bdwrite(bp); + return (cg * fs->fs_ipg + ipref); + fail: + if (bp != NULL) + brelse(bp, 0); + if (ibp != NULL) + brelse(ibp, 0); + mutex_enter(&ump->um_lock); + return (0); +} + +/* + * Allocate a block or fragment. + * + * The specified block or fragment is removed from the + * free map, possibly fragmenting a block in the process. 
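+ * For example, claiming a 2-fragment piece out of a fully free + * 8-fragment block first converts the block (cs_nbfree--, cs_nffree += + * fs_frag), then clears the two requested fragments, leaving six free + * fragments accounted in cs_nffree and cg_frsum.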
+ * + * This implementation should mirror fs_blkfree + * + * => um_lock not held on entry or exit + */ +int +ffs_blkalloc(struct inode *ip, daddr_t bno, long size) +{ + int error; + + error = ffs_check_bad_allocation(__func__, ip->i_fs, bno, size, + ip->i_dev, ip->i_uid); + if (error) + return error; + + return ffs_blkalloc_ump(ip->i_ump, bno, size); +} + +int +ffs_blkalloc_ump(struct ufsmount *ump, daddr_t bno, long size) +{ + struct fs *fs = ump->um_fs; + struct cg *cgp; + struct buf *bp; + int32_t fragno, cgbno; + int i, error, cg, blk, frags, bbase; + u_int8_t *blksfree; + const int needswap = UFS_FSNEEDSWAP(fs); + + KASSERT((u_int)size <= fs->fs_bsize && fragoff(fs, size) == 0 && + fragnum(fs, bno) + numfrags(fs, size) <= fs->fs_frag); + KASSERT(bno < fs->fs_size); + + cg = dtog(fs, bno); + error = bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + return error; + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp, needswap)) { + brelse(bp, 0); + return EIO; + } + cgp->cg_old_time = ufs_rw32(time_second, needswap); + cgp->cg_time = ufs_rw64(time_second, needswap); + cgbno = dtogd(fs, bno); + blksfree = cg_blksfree(cgp, needswap); + + mutex_enter(&ump->um_lock); + if (size == fs->fs_bsize) { + fragno = fragstoblks(fs, cgbno); + if (!ffs_isblock(fs, blksfree, fragno)) { + mutex_exit(&ump->um_lock); + brelse(bp, 0); + return EBUSY; + } + ffs_clrblock(fs, blksfree, fragno); + ffs_clusteracct(fs, cgp, fragno, -1); + ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap); + fs->fs_cstotal.cs_nbfree--; + fs->fs_cs(fs, cg).cs_nbfree--; + } else { + bbase = cgbno - fragnum(fs, cgbno); + + frags = numfrags(fs, size); + for (i = 0; i < frags; i++) { + if (isclr(blksfree, cgbno + i)) { + mutex_exit(&ump->um_lock); + brelse(bp, 0); + return EBUSY; + } + } + /* + * if a complete block is being split, account for it + */ + fragno = fragstoblks(fs, bbase); + if (ffs_isblock(fs, blksfree, fragno)) { + ufs_add32(cgp->cg_cs.cs_nffree, fs->fs_frag, needswap); + fs->fs_cstotal.cs_nffree += fs->fs_frag; + fs->fs_cs(fs, cg).cs_nffree += fs->fs_frag; + ffs_clusteracct(fs, cgp, fragno, -1); + ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap); + fs->fs_cstotal.cs_nbfree--; + fs->fs_cs(fs, cg).cs_nbfree--; + } + /* + * decrement the counts associated with the old frags + */ + blk = blkmap(fs, blksfree, bbase); + ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap); + /* + * allocate the fragment + */ + for (i = 0; i < frags; i++) { + clrbit(blksfree, cgbno + i); + } + ufs_add32(cgp->cg_cs.cs_nffree, -i, needswap); + fs->fs_cstotal.cs_nffree -= i; + fs->fs_cs(fs, cg).cs_nffree -= i; + /* + * add back in counts associated with the new frags + */ + blk = blkmap(fs, blksfree, bbase); + ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap); + } + fs->fs_fmod = 1; + ACTIVECG_CLR(fs, cg); + mutex_exit(&ump->um_lock); + bdwrite(bp); + return 0; +} + +/* + * Free a block or fragment. + * + * The specified block or fragment is placed back in the + * free map. If a fragment is deallocated, a possible + * block reassembly is checked. 
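+ * For example, freeing the last two busy fragments of a block whose + * other six fragments are already free reassembles the whole block: the + * fragment counts are rolled back and cs_nbfree is credited instead of + * cs_nffree.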
+ * + * => um_lock not held on entry or exit + */ +void +ffs_blkfree(struct fs *fs, struct vnode *devvp, daddr_t bno, long size, + ino_t inum) +{ + struct cg *cgp; + struct buf *bp; + struct ufsmount *ump; + daddr_t cgblkno; + int error, cg; + dev_t dev; + const bool devvp_is_snapshot = (devvp->v_type != VBLK); +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + + KASSERT(!devvp_is_snapshot); + + cg = dtog(fs, bno); + dev = devvp->v_rdev; + ump = VFSTOUFS(devvp->v_specmountpoint); + KASSERT(fs == ump->um_fs); + cgblkno = fsbtodb(fs, cgtod(fs, cg)); + if (ffs_snapblkfree(fs, devvp, bno, size, inum)) + return; + + error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum); + if (error) + return; + + error = bread(devvp, cgblkno, (int)fs->fs_cgsize, + NOCRED, B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + return; + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp, needswap)) { + brelse(bp, 0); + return; + } + + ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot); + + bdwrite(bp); +} + +/* + * Free a block or fragment from a snapshot cg copy. + * + * The specified block or fragment is placed back in the + * free map. If a fragment is deallocated, a possible + * block reassembly is checked. + * + * => um_lock not held on entry or exit + */ +void +ffs_blkfree_snap(struct fs *fs, struct vnode *devvp, daddr_t bno, long size, + ino_t inum) +{ + struct cg *cgp; + struct buf *bp; + struct ufsmount *ump; + daddr_t cgblkno; + int error, cg; + dev_t dev; + const bool devvp_is_snapshot = (devvp->v_type != VBLK); +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + + KASSERT(devvp_is_snapshot); + + cg = dtog(fs, bno); + dev = VTOI(devvp)->i_devvp->v_rdev; + ump = VFSTOUFS(devvp->v_mount); + cgblkno = fragstoblks(fs, cgtod(fs, cg)); + + error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum); + if (error) + return; + + error = bread(devvp, cgblkno, (int)fs->fs_cgsize, + NOCRED, B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + return; + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp, needswap)) { + brelse(bp, 0); + return; + } + + ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot); + + bdwrite(bp); +} + +static void +ffs_blkfree_common(struct ufsmount *ump, struct fs *fs, dev_t dev, + struct buf *bp, daddr_t bno, long size, bool devvp_is_snapshot) +{ + struct cg *cgp; + int32_t fragno, cgbno; + int i, cg, blk, frags, bbase; + u_int8_t *blksfree; + const int needswap = UFS_FSNEEDSWAP(fs); + + cg = dtog(fs, bno); + cgp = (struct cg *)bp->b_data; + cgp->cg_old_time = ufs_rw32(time_second, needswap); + if ((fs->fs_magic != FS_UFS1_MAGIC) || + (fs->fs_old_flags & FS_FLAGS_UPDATED)) + cgp->cg_time = ufs_rw64(time_second, needswap); + cgbno = dtogd(fs, bno); + blksfree = cg_blksfree(cgp, needswap); + mutex_enter(&ump->um_lock); + if (size == fs->fs_bsize) { + fragno = fragstoblks(fs, cgbno); + if (!ffs_isfreeblock(fs, blksfree, fragno)) { + if (devvp_is_snapshot) { + mutex_exit(&ump->um_lock); + return; + } + printf("dev = 0x%llx, block = %" PRId64 ", fs = %s\n", + (unsigned long long)dev, bno, fs->fs_fsmnt); + panic("blkfree: freeing free block"); + } + ffs_setblock(fs, blksfree, fragno); + ffs_clusteracct(fs, cgp, fragno, 1); + ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap); + fs->fs_cstotal.cs_nbfree++; + fs->fs_cs(fs, cg).cs_nbfree++; + if ((fs->fs_magic == FS_UFS1_MAGIC) && + ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) { + i = old_cbtocylno(fs, cgbno); + KASSERT(i >= 0); + KASSERT(i < fs->fs_old_ncyl); 
+ KASSERT(old_cbtorpos(fs, cgbno) >= 0); + KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, cgbno) < fs->fs_old_nrpos); + ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs, cgbno)], 1, + needswap); + ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap); + } + } else { + bbase = cgbno - fragnum(fs, cgbno); + /* + * decrement the counts associated with the old frags + */ + blk = blkmap(fs, blksfree, bbase); + ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap); + /* + * deallocate the fragment + */ + frags = numfrags(fs, size); + for (i = 0; i < frags; i++) { + if (isset(blksfree, cgbno + i)) { + printf("dev = 0x%llx, block = %" PRId64 + ", fs = %s\n", + (unsigned long long)dev, bno + i, + fs->fs_fsmnt); + panic("blkfree: freeing free frag"); + } + setbit(blksfree, cgbno + i); + } + ufs_add32(cgp->cg_cs.cs_nffree, i, needswap); + fs->fs_cstotal.cs_nffree += i; + fs->fs_cs(fs, cg).cs_nffree += i; + /* + * add back in counts associated with the new frags + */ + blk = blkmap(fs, blksfree, bbase); + ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap); + /* + * if a complete block has been reassembled, account for it + */ + fragno = fragstoblks(fs, bbase); + if (ffs_isblock(fs, blksfree, fragno)) { + ufs_add32(cgp->cg_cs.cs_nffree, -fs->fs_frag, needswap); + fs->fs_cstotal.cs_nffree -= fs->fs_frag; + fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; + ffs_clusteracct(fs, cgp, fragno, 1); + ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap); + fs->fs_cstotal.cs_nbfree++; + fs->fs_cs(fs, cg).cs_nbfree++; + if ((fs->fs_magic == FS_UFS1_MAGIC) && + ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) { + i = old_cbtocylno(fs, bbase); + KASSERT(i >= 0); + KASSERT(i < fs->fs_old_ncyl); + KASSERT(old_cbtorpos(fs, bbase) >= 0); + KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, bbase) < fs->fs_old_nrpos); + ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs, + bbase)], 1, needswap); + ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap); + } + } + } + fs->fs_fmod = 1; + ACTIVECG_CLR(fs, cg); + mutex_exit(&ump->um_lock); +} + +/* + * Free an inode. + */ +int +ffs_vfree(struct vnode *vp, ino_t ino, int mode) +{ + + return ffs_freefile(vp->v_mount, ino, mode); +} + +/* + * Do the actual free operation. + * The specified inode is placed back in the free map. 
+ * + * => um_lock not held on entry or exit + */ +int +ffs_freefile(struct mount *mp, ino_t ino, int mode) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + struct vnode *devvp; + struct cg *cgp; + struct buf *bp; + int error, cg; + daddr_t cgbno; + dev_t dev; +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + + cg = ino_to_cg(fs, ino); + devvp = ump->um_devvp; + dev = devvp->v_rdev; + cgbno = fsbtodb(fs, cgtod(fs, cg)); + + if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg) + panic("ifree: range: dev = 0x%llx, ino = %llu, fs = %s", + (long long)dev, (unsigned long long)ino, fs->fs_fsmnt); + error = bread(devvp, cgbno, (int)fs->fs_cgsize, + NOCRED, B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + return (error); + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp, needswap)) { + brelse(bp, 0); + return (0); + } + + ffs_freefile_common(ump, fs, dev, bp, ino, mode, false); + + bdwrite(bp); + + return 0; +} + +int +ffs_freefile_snap(struct fs *fs, struct vnode *devvp, ino_t ino, int mode) +{ + struct ufsmount *ump; + struct cg *cgp; + struct buf *bp; + int error, cg; + daddr_t cgbno; + dev_t dev; +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + + KASSERT(devvp->v_type != VBLK); + + cg = ino_to_cg(fs, ino); + dev = VTOI(devvp)->i_devvp->v_rdev; + ump = VFSTOUFS(devvp->v_mount); + cgbno = fragstoblks(fs, cgtod(fs, cg)); + if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg) + panic("ifree: range: dev = 0x%llx, ino = %llu, fs = %s", + (unsigned long long)dev, (unsigned long long)ino, + fs->fs_fsmnt); + error = bread(devvp, cgbno, (int)fs->fs_cgsize, + NOCRED, B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + return (error); + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp, needswap)) { + brelse(bp, 0); + return (0); + } + ffs_freefile_common(ump, fs, dev, bp, ino, mode, true); + + bdwrite(bp); + + return 0; +} + +static void +ffs_freefile_common(struct ufsmount *ump, struct fs *fs, dev_t dev, + struct buf *bp, ino_t ino, int mode, bool devvp_is_snapshot) +{ + int cg; + struct cg *cgp; + u_int8_t *inosused; +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + + cg = ino_to_cg(fs, ino); + cgp = (struct cg *)bp->b_data; + cgp->cg_old_time = ufs_rw32(time_second, needswap); + if ((fs->fs_magic != FS_UFS1_MAGIC) || + (fs->fs_old_flags & FS_FLAGS_UPDATED)) + cgp->cg_time = ufs_rw64(time_second, needswap); + inosused = cg_inosused(cgp, needswap); + ino %= fs->fs_ipg; + if (isclr(inosused, ino)) { + printf("ifree: dev = 0x%llx, ino = %llu, fs = %s\n", + (unsigned long long)dev, (unsigned long long)ino + + cg * fs->fs_ipg, fs->fs_fsmnt); + if (fs->fs_ronly == 0) + panic("ifree: freeing free inode"); + } + clrbit(inosused, ino); + if (!devvp_is_snapshot) + UFS_WAPBL_UNREGISTER_INODE(ump->um_mountp, + ino + cg * fs->fs_ipg, mode); + if (ino < ufs_rw32(cgp->cg_irotor, needswap)) + cgp->cg_irotor = ufs_rw32(ino, needswap); + ufs_add32(cgp->cg_cs.cs_nifree, 1, needswap); + mutex_enter(&ump->um_lock); + fs->fs_cstotal.cs_nifree++; + fs->fs_cs(fs, cg).cs_nifree++; + if ((mode & IFMT) == IFDIR) { + ufs_add32(cgp->cg_cs.cs_ndir, -1, needswap); + fs->fs_cstotal.cs_ndir--; + fs->fs_cs(fs, cg).cs_ndir--; + } + fs->fs_fmod = 1; + ACTIVECG_CLR(fs, cg); + mutex_exit(&ump->um_lock); +} + +/* + * Check to see if a file is free. 
+ */ +int +ffs_checkfreefile(struct fs *fs, struct vnode *devvp, ino_t ino) +{ + struct cg *cgp; + struct buf *bp; + daddr_t cgbno; + int ret, cg; + u_int8_t *inosused; + const bool devvp_is_snapshot = (devvp->v_type != VBLK); + + KASSERT(devvp_is_snapshot); + + cg = ino_to_cg(fs, ino); + if (devvp_is_snapshot) + cgbno = fragstoblks(fs, cgtod(fs, cg)); + else + cgbno = fsbtodb(fs, cgtod(fs, cg)); + if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg) + return 1; + if (bread(devvp, cgbno, (int)fs->fs_cgsize, NOCRED, 0, &bp)) { + brelse(bp, 0); + return 1; + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) { + brelse(bp, 0); + return 1; + } + inosused = cg_inosused(cgp, UFS_FSNEEDSWAP(fs)); + ino %= fs->fs_ipg; + ret = isclr(inosused, ino); + brelse(bp, 0); + return ret; +} + +/* + * Find a block of the specified size in the specified cylinder group. + * + * It is a panic if a request is made to find a block if none are + * available. + */ +static int32_t +ffs_mapsearch(struct fs *fs, struct cg *cgp, daddr_t bpref, int allocsiz) +{ + int32_t bno; + int start, len, loc, i; + int blk, field, subfield, pos; + int ostart, olen; + u_int8_t *blksfree; +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + + /* KASSERT(mutex_owned(&ump->um_lock)); */ + + /* + * find the fragment by searching through the free block + * map for an appropriate bit pattern + */ + if (bpref) + start = dtogd(fs, bpref) / NBBY; + else + start = ufs_rw32(cgp->cg_frotor, needswap) / NBBY; + blksfree = cg_blksfree(cgp, needswap); + len = howmany(fs->fs_fpg, NBBY) - start; + ostart = start; + olen = len; + loc = scanc((u_int)len, + (const u_char *)&blksfree[start], + (const u_char *)fragtbl[fs->fs_frag], + (1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1))))); + if (loc == 0) { + len = start + 1; + start = 0; + loc = scanc((u_int)len, + (const u_char *)&blksfree[0], + (const u_char *)fragtbl[fs->fs_frag], + (1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1))))); + if (loc == 0) { + printf("start = %d, len = %d, fs = %s\n", + ostart, olen, fs->fs_fsmnt); + printf("offset=%d %ld\n", + ufs_rw32(cgp->cg_freeoff, needswap), + (long)blksfree - (long)cgp); + printf("cg %d\n", cgp->cg_cgx); + panic("ffs_alloccg: map corrupted"); + /* NOTREACHED */ + } + } + bno = (start + len - loc) * NBBY; + cgp->cg_frotor = ufs_rw32(bno, needswap); + /* + * found the byte in the map + * sift through the bits to find the selected frag + */ + for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { + blk = blkmap(fs, blksfree, bno); + blk <<= 1; + field = around[allocsiz]; + subfield = inside[allocsiz]; + for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) { + if ((blk & field) == subfield) + return (bno + pos); + field <<= 1; + subfield <<= 1; + } + } + printf("bno = %d, fs = %s\n", bno, fs->fs_fsmnt); + panic("ffs_alloccg: block not in map"); + /* return (-1); */ +} + +/* + * Fserr prints the name of a file system with an error diagnostic. + * + * The form of the error message is: + * fs: error message + */ +static void +ffs_fserr(struct fs *fs, u_int uid, const char *cp) +{ + + log(LOG_ERR, "uid %d, pid %d, command %s, on %s: %s\n", + uid, curproc->p_pid, curproc->p_comm, fs->fs_fsmnt, cp); +} diff --git a/sys/ufs/ffs/ffs_appleufs.c b/sys/ufs/ffs/ffs_appleufs.c new file mode 100644 index 000000000..0067d40e9 --- /dev/null +++ b/sys/ufs/ffs/ffs_appleufs.c @@ -0,0 +1,154 @@ +/* $NetBSD: ffs_appleufs.c,v 1.12 2011/11/19 22:51:31 tls Exp $ */ + +/* + * Copyright (c) 2002 Darrin B. Jewell + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ffs_appleufs.c,v 1.12 2011/11/19 22:51:31 tls Exp $"); + +#include +#include +#if defined(_KERNEL) +#include +#include +#include +#endif + +#include +#include +#include +#include + +#if !defined(_KERNEL) && !defined(STANDALONE) +#include +#include +#include +#include +#include +#include +#define KASSERT(x) assert(x) +#endif + +/* + * this is the same calculation as in_cksum + */ +u_int16_t +ffs_appleufs_cksum(const struct appleufslabel *appleufs) +{ + const u_int16_t *p = (const u_int16_t *)appleufs; + int len = APPLEUFS_LABEL_SIZE; /* sizeof(struct appleufslabel) */ + long res = 0; + while (len > 1) { + res += *p++; + len -= 2; + } +#if 0 /* APPLEUFS_LABEL_SIZE is guaranteed to be even */ + if (len == 1) + res += htobe16(*(u_char *)p<<8); +#endif + res = (res >> 16) + (res & 0xffff); + res += (res >> 16); + return (~res); +} + +/* copies o to n, validating and byteswapping along the way + * returns 0 if ok, EINVAL if not valid + */ +int +ffs_appleufs_validate(const char *name, const struct appleufslabel *o, + struct appleufslabel *n) +{ + struct appleufslabel tmp; + if (!n) n = &tmp; + + if (o->ul_magic != be32toh(APPLEUFS_LABEL_MAGIC)) { + return EINVAL; + } + *n = *o; + n->ul_checksum = 0; + n->ul_checksum = ffs_appleufs_cksum(n); + if (n->ul_checksum != o->ul_checksum) { +#if defined(DIAGNOSTIC) || !defined(_KERNEL) + printf("%s: invalid APPLE UFS checksum. 
found 0x%x, expecting 0x%x", + name,o->ul_checksum,n->ul_checksum); +#endif + return EINVAL; + } + n->ul_magic = be32toh(o->ul_magic); + n->ul_version = be32toh(o->ul_version); + n->ul_time = be32toh(o->ul_time); + n->ul_namelen = be16toh(o->ul_namelen); + + if (n->ul_namelen > APPLEUFS_MAX_LABEL_NAME) { +#if defined(DIAGNOSTIC) || !defined(_KERNEL) + printf("%s: APPLE UFS label name too long, truncated.\n", + name); +#endif + n->ul_namelen = APPLEUFS_MAX_LABEL_NAME; + } + /* if len is max, will set ul_unused1 */ + n->ul_name[n->ul_namelen - 1] = '\0'; + +#ifdef DEBUG + printf("%s: found APPLE UFS label v%d: \"%s\"\n", + name,n->ul_version,n->ul_name); +#endif + n->ul_uuid = be64toh(o->ul_uuid); + + return 0; +} + +void +ffs_appleufs_set(struct appleufslabel *appleufs, const char *name, time_t t, + uint64_t uuid) +{ + size_t namelen; + if (!name) name = "untitled"; + if (t == ((time_t)-1)) { +#if defined(_KERNEL) + t = time_second; +#elif defined(STANDALONE) + t = 0; +#else + (void)time(&t); +#endif + } + if (uuid == 0) { +#if defined(_KERNEL) && !defined(STANDALONE) + uuid = cprng_fast64(); +#endif + } + namelen = strlen(name); + if (namelen > APPLEUFS_MAX_LABEL_NAME) + namelen = APPLEUFS_MAX_LABEL_NAME; + memset(appleufs, 0, APPLEUFS_LABEL_SIZE); + appleufs->ul_magic = htobe32(APPLEUFS_LABEL_MAGIC); + appleufs->ul_version = htobe32(APPLEUFS_LABEL_VERSION); + appleufs->ul_time = htobe32((u_int32_t)t); + appleufs->ul_namelen = htobe16(namelen); + strncpy(appleufs->ul_name,name,namelen); + appleufs->ul_uuid = htobe64(uuid); + appleufs->ul_checksum = ffs_appleufs_cksum(appleufs); +} diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c new file mode 100644 index 000000000..3683cbb19 --- /dev/null +++ b/sys/ufs/ffs/ffs_balloc.c @@ -0,0 +1,1051 @@ +/* $NetBSD: ffs_balloc.c,v 1.54 2011/04/23 07:36:02 hannken Exp $ */ + +/* + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Marshall + * Kirk McKusick and Network Associates Laboratories, the Security + * Research Division of Network Associates, Inc. under DARPA/SPAWAR + * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS + * research program + * + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_balloc.c 8.8 (Berkeley) 6/16/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ffs_balloc.c,v 1.54 2011/04/23 07:36:02 hannken Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_quota.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include + +static int ffs_balloc_ufs1(struct vnode *, off_t, int, kauth_cred_t, int, + struct buf **); +static int ffs_balloc_ufs2(struct vnode *, off_t, int, kauth_cred_t, int, + struct buf **); + +/* + * Balloc defines the structure of file system storage + * by allocating the physical blocks on a device given + * the inode and the logical block number in a file. + */ + +int +ffs_balloc(struct vnode *vp, off_t off, int size, kauth_cred_t cred, int flags, + struct buf **bpp) +{ + int error; + + if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC) + error = ffs_balloc_ufs2(vp, off, size, cred, flags, bpp); + else + error = ffs_balloc_ufs1(vp, off, size, cred, flags, bpp); + + if (error == 0 && bpp != NULL && (error = fscow_run(*bpp, false)) != 0) + brelse(*bpp, 0); + + return error; +} + +static int +ffs_balloc_ufs1(struct vnode *vp, off_t off, int size, kauth_cred_t cred, + int flags, struct buf **bpp) +{ + daddr_t lbn, lastlbn; + struct buf *bp, *nbp; + struct inode *ip = VTOI(vp); + struct fs *fs = ip->i_fs; + struct ufsmount *ump = ip->i_ump; + struct indir indirs[NIADDR + 2]; + daddr_t newb, pref, nb; + int32_t *bap; /* XXX ondisk32 */ + int deallocated, osize, nsize, num, i, error; + int32_t *blkp, *allocblk, allociblk[NIADDR + 1]; + int32_t *allocib; + int unwindidx = -1; +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + UVMHIST_FUNC("ffs_balloc"); UVMHIST_CALLED(ubchist); + + lbn = lblkno(fs, off); + size = blkoff(fs, off) + size; + if (size > fs->fs_bsize) + panic("ffs_balloc: blk too big"); + if (bpp != NULL) { + *bpp = NULL; + } + UVMHIST_LOG(ubchist, "vp %p lbn 0x%x size 0x%x", vp, lbn, size,0); + + if (lbn < 0) + return (EFBIG); + + /* + * If the next write will extend the file into a new block, + * and the file is currently composed of a fragment + * this fragment has to be extended to be a full block. 
+ */ + + lastlbn = lblkno(fs, ip->i_size); + if (lastlbn < NDADDR && lastlbn < lbn) { + nb = lastlbn; + osize = blksize(fs, ip, nb); + if (osize < fs->fs_bsize && osize > 0) { + mutex_enter(&ump->um_lock); + error = ffs_realloccg(ip, nb, + ffs_blkpref_ufs1(ip, lastlbn, nb, flags, + &ip->i_ffs1_db[0]), + osize, (int)fs->fs_bsize, cred, bpp, &newb); + if (error) + return (error); + ip->i_size = lblktosize(fs, nb + 1); + ip->i_ffs1_size = ip->i_size; + uvm_vnp_setsize(vp, ip->i_ffs1_size); + ip->i_ffs1_db[nb] = ufs_rw32((u_int32_t)newb, needswap); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (bpp && *bpp) { + if (flags & B_SYNC) + bwrite(*bpp); + else + bawrite(*bpp); + } + } + } + + /* + * The first NDADDR blocks are direct blocks + */ + + if (lbn < NDADDR) { + nb = ufs_rw32(ip->i_ffs1_db[lbn], needswap); + if (nb != 0 && ip->i_size >= lblktosize(fs, lbn + 1)) { + + /* + * The block is an already-allocated direct block + * and the file already extends past this block, + * thus this must be a whole block. + * Just read the block (if requested). + */ + + if (bpp != NULL) { + error = bread(vp, lbn, fs->fs_bsize, NOCRED, + B_MODIFY, bpp); + if (error) { + brelse(*bpp, 0); + return (error); + } + } + return (0); + } + if (nb != 0) { + + /* + * Consider need to reallocate a fragment. + */ + + osize = fragroundup(fs, blkoff(fs, ip->i_size)); + nsize = fragroundup(fs, size); + if (nsize <= osize) { + + /* + * The existing block is already + * at least as big as we want. + * Just read the block (if requested). + */ + + if (bpp != NULL) { + error = bread(vp, lbn, osize, NOCRED, + B_MODIFY, bpp); + if (error) { + brelse(*bpp, 0); + return (error); + } + } + return 0; + } else { + + /* + * The existing block is smaller than we want, + * grow it. + */ + mutex_enter(&ump->um_lock); + error = ffs_realloccg(ip, lbn, + ffs_blkpref_ufs1(ip, lbn, (int)lbn, flags, + &ip->i_ffs1_db[0]), + osize, nsize, cred, bpp, &newb); + if (error) + return (error); + } + } else { + + /* + * the block was not previously allocated, + * allocate a new block or fragment. + */ + + if (ip->i_size < lblktosize(fs, lbn + 1)) + nsize = fragroundup(fs, size); + else + nsize = fs->fs_bsize; + mutex_enter(&ump->um_lock); + error = ffs_alloc(ip, lbn, + ffs_blkpref_ufs1(ip, lbn, (int)lbn, flags, + &ip->i_ffs1_db[0]), + nsize, flags, cred, &newb); + if (error) + return (error); + if (bpp != NULL) { + error = ffs_getblk(vp, lbn, fsbtodb(fs, newb), + nsize, (flags & B_CLRBUF) != 0, bpp); + if (error) + return error; + } + } + ip->i_ffs1_db[lbn] = ufs_rw32((u_int32_t)newb, needswap); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + return (0); + } + + /* + * Determine the number of levels of indirection. + */ + + pref = 0; + if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0) + return (error); + + /* + * Fetch the first indirect block allocating if necessary. + */ + + --num; + nb = ufs_rw32(ip->i_ffs1_ib[indirs[0].in_off], needswap); + allocib = NULL; + allocblk = allociblk; + if (nb == 0) { + mutex_enter(&ump->um_lock); + pref = ffs_blkpref_ufs1(ip, lbn, 0, flags | B_METAONLY, NULL); + error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, + flags | B_METAONLY, cred, &newb); + if (error) + goto fail; + nb = newb; + *allocblk++ = nb; + error = ffs_getblk(vp, indirs[1].in_lbn, fsbtodb(fs, nb), + fs->fs_bsize, true, &bp); + if (error) + goto fail; + /* + * Write synchronously so that indirect blocks + * never point at garbage. 
+ */ + if ((error = bwrite(bp)) != 0) + goto fail; + unwindidx = 0; + allocib = &ip->i_ffs1_ib[indirs[0].in_off]; + *allocib = ufs_rw32(nb, needswap); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } + + /* + * Fetch through the indirect blocks, allocating as necessary. + */ + + for (i = 1;;) { + error = bread(vp, + indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, 0, &bp); + if (error) { + brelse(bp, 0); + goto fail; + } + bap = (int32_t *)bp->b_data; /* XXX ondisk32 */ + nb = ufs_rw32(bap[indirs[i].in_off], needswap); + if (i == num) + break; + i++; + if (nb != 0) { + brelse(bp, 0); + continue; + } + if (fscow_run(bp, true) != 0) { + brelse(bp, 0); + goto fail; + } + mutex_enter(&ump->um_lock); + /* Try to keep snapshot indirect blocks contiguous. */ + if (i == num && (ip->i_flags & SF_SNAPSHOT) != 0) + pref = ffs_blkpref_ufs1(ip, lbn, indirs[i-1].in_off, + flags | B_METAONLY, &bap[0]); + if (pref == 0) + pref = ffs_blkpref_ufs1(ip, lbn, 0, flags | B_METAONLY, + NULL); + error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, + flags | B_METAONLY, cred, &newb); + if (error) { + brelse(bp, 0); + goto fail; + } + nb = newb; + *allocblk++ = nb; + error = ffs_getblk(vp, indirs[i].in_lbn, fsbtodb(fs, nb), + fs->fs_bsize, true, &nbp); + if (error) { + brelse(bp, 0); + goto fail; + } + /* + * Write synchronously so that indirect blocks + * never point at garbage. + */ + if ((error = bwrite(nbp)) != 0) { + brelse(bp, 0); + goto fail; + } + if (unwindidx < 0) + unwindidx = i - 1; + bap[indirs[i - 1].in_off] = ufs_rw32(nb, needswap); + + /* + * If required, write synchronously, otherwise use + * delayed write. + */ + + if (flags & B_SYNC) { + bwrite(bp); + } else { + bdwrite(bp); + } + } + + if (flags & B_METAONLY) { + KASSERT(bpp != NULL); + *bpp = bp; + return (0); + } + + /* + * Get the data block, allocating if necessary. + */ + + if (nb == 0) { + if (fscow_run(bp, true) != 0) { + brelse(bp, 0); + goto fail; + } + mutex_enter(&ump->um_lock); + pref = ffs_blkpref_ufs1(ip, lbn, indirs[num].in_off, flags, + &bap[0]); + error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags, cred, + &newb); + if (error) { + brelse(bp, 0); + goto fail; + } + nb = newb; + *allocblk++ = nb; + if (bpp != NULL) { + error = ffs_getblk(vp, lbn, fsbtodb(fs, nb), + fs->fs_bsize, (flags & B_CLRBUF) != 0, bpp); + if (error) { + brelse(bp, 0); + goto fail; + } + } + bap[indirs[num].in_off] = ufs_rw32(nb, needswap); + if (allocib == NULL && unwindidx < 0) { + unwindidx = i - 1; + } + + /* + * If required, write synchronously, otherwise use + * delayed write. + */ + + if (flags & B_SYNC) { + bwrite(bp); + } else { + bdwrite(bp); + } + return (0); + } + brelse(bp, 0); + if (bpp != NULL) { + if (flags & B_CLRBUF) { + error = bread(vp, lbn, (int)fs->fs_bsize, + NOCRED, B_MODIFY, &nbp); + if (error) { + brelse(nbp, 0); + goto fail; + } + } else { + error = ffs_getblk(vp, lbn, fsbtodb(fs, nb), + fs->fs_bsize, true, &nbp); + if (error) + goto fail; + } + *bpp = nbp; + } + return (0); + +fail: + /* + * If we have failed part way through block allocation, we + * have to deallocate any indirect blocks that we have allocated. + */ + + if (unwindidx >= 0) { + + /* + * First write out any buffers we've created to resolve their + * softdeps. This must be done in reverse order of creation + * so that we resolve the dependencies in one pass. + * Write the cylinder group buffers for these buffers too. 
+ */ + + for (i = num; i >= unwindidx; i--) { + if (i == 0) { + break; + } + if (ffs_getblk(vp, indirs[i].in_lbn, FFS_NOBLK, + fs->fs_bsize, false, &bp) != 0) + continue; + if (bp->b_oflags & BO_DELWRI) { + nb = fsbtodb(fs, cgtod(fs, dtog(fs, + dbtofsb(fs, bp->b_blkno)))); + bwrite(bp); + if (ffs_getblk(ip->i_devvp, nb, FFS_NOBLK, + fs->fs_cgsize, false, &bp) != 0) + continue; + if (bp->b_oflags & BO_DELWRI) { + bwrite(bp); + } else { + brelse(bp, BC_INVAL); + } + } else { + brelse(bp, BC_INVAL); + } + } + + /* + * Undo the partial allocation. + */ + if (unwindidx == 0) { + *allocib = 0; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } else { + int r; + + r = bread(vp, indirs[unwindidx].in_lbn, + (int)fs->fs_bsize, NOCRED, 0, &bp); + if (r) { + panic("Could not unwind indirect block, error %d", r); + brelse(bp, 0); + } else { + bap = (int32_t *)bp->b_data; /* XXX ondisk32 */ + bap[indirs[unwindidx].in_off] = 0; + bwrite(bp); + } + } + for (i = unwindidx + 1; i <= num; i++) { + if (ffs_getblk(vp, indirs[i].in_lbn, FFS_NOBLK, + fs->fs_bsize, false, &bp) == 0) + brelse(bp, BC_INVAL); + } + } + for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) { + ffs_blkfree(fs, ip->i_devvp, *blkp, fs->fs_bsize, ip->i_number); + deallocated += fs->fs_bsize; + } + if (deallocated) { +#if defined(QUOTA) || defined(QUOTA2) + /* + * Restore user's disk quota because allocation failed. + */ + (void)chkdq(ip, -btodb(deallocated), cred, FORCE); +#endif + ip->i_ffs1_blocks -= btodb(deallocated); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } + return (error); +} + +static int +ffs_balloc_ufs2(struct vnode *vp, off_t off, int size, kauth_cred_t cred, + int flags, struct buf **bpp) +{ + daddr_t lbn, lastlbn; + struct buf *bp, *nbp; + struct inode *ip = VTOI(vp); + struct fs *fs = ip->i_fs; + struct ufsmount *ump = ip->i_ump; + struct indir indirs[NIADDR + 2]; + daddr_t newb, pref, nb; + int64_t *bap; + int deallocated, osize, nsize, num, i, error; + daddr_t *blkp, *allocblk, allociblk[NIADDR + 1]; + int64_t *allocib; + int unwindidx = -1; +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + UVMHIST_FUNC("ffs_balloc"); UVMHIST_CALLED(ubchist); + + lbn = lblkno(fs, off); + size = blkoff(fs, off) + size; + if (size > fs->fs_bsize) + panic("ffs_balloc: blk too big"); + if (bpp != NULL) { + *bpp = NULL; + } + UVMHIST_LOG(ubchist, "vp %p lbn 0x%x size 0x%x", vp, lbn, size,0); + + if (lbn < 0) + return (EFBIG); + +#ifdef notyet + /* + * Check for allocating external data. + */ + if (flags & IO_EXT) { + if (lbn >= NXADDR) + return (EFBIG); + /* + * If the next write will extend the data into a new block, + * and the data is currently composed of a fragment + * this fragment has to be extended to be a full block. 
+ */ + lastlbn = lblkno(fs, dp->di_extsize); + if (lastlbn < lbn) { + nb = lastlbn; + osize = sblksize(fs, dp->di_extsize, nb); + if (osize < fs->fs_bsize && osize > 0) { + mutex_enter(&ump->um_lock); + error = ffs_realloccg(ip, -1 - nb, + dp->di_extb[nb], + ffs_blkpref_ufs2(ip, lastlbn, (int)nb, + flags, &dp->di_extb[0]), + osize, + (int)fs->fs_bsize, cred, &bp); + if (error) + return (error); + dp->di_extsize = smalllblktosize(fs, nb + 1); + dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno); + bp->b_xflags |= BX_ALTDATA; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (flags & IO_SYNC) + bwrite(bp); + else + bawrite(bp); + } + } + /* + * All blocks are direct blocks + */ + if (flags & BA_METAONLY) + panic("ffs_balloc_ufs2: BA_METAONLY for ext block"); + nb = dp->di_extb[lbn]; + if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) { + error = bread(vp, -1 - lbn, fs->fs_bsize, + NOCRED, 0, &bp); + if (error) { + brelse(bp, 0); + return (error); + } + mutex_enter(&bp->b_interlock); + bp->b_blkno = fsbtodb(fs, nb); + bp->b_xflags |= BX_ALTDATA; + mutex_exit(&bp->b_interlock); + *bpp = bp; + return (0); + } + if (nb != 0) { + /* + * Consider need to reallocate a fragment. + */ + osize = fragroundup(fs, blkoff(fs, dp->di_extsize)); + nsize = fragroundup(fs, size); + if (nsize <= osize) { + error = bread(vp, -1 - lbn, osize, + NOCRED, 0, &bp); + if (error) { + brelse(bp, 0); + return (error); + } + mutex_enter(&bp->b_interlock); + bp->b_blkno = fsbtodb(fs, nb); + bp->b_xflags |= BX_ALTDATA; + mutex_exit(&bp->b_interlock); + } else { + mutex_enter(&ump->um_lock); + error = ffs_realloccg(ip, -1 - lbn, + dp->di_extb[lbn], + ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags, + &dp->di_extb[0]), + osize, nsize, cred, &bp); + if (error) + return (error); + bp->b_xflags |= BX_ALTDATA; + } + } else { + if (dp->di_extsize < smalllblktosize(fs, lbn + 1)) + nsize = fragroundup(fs, size); + else + nsize = fs->fs_bsize; + mutex_enter(&ump->um_lock); + error = ffs_alloc(ip, lbn, + ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags, + &dp->di_extb[0]), + nsize, flags, cred, &newb); + if (error) + return (error); + error = ffs_getblk(vp, -1 - lbn, fsbtodb(fs, newb), + nsize, (flags & BA_CLRBUF) != 0, &bp); + if (error) + return error; + bp->b_xflags |= BX_ALTDATA; + } + dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + *bpp = bp; + return (0); + } +#endif + /* + * If the next write will extend the file into a new block, + * and the file is currently composed of a fragment + * this fragment has to be extended to be a full block. 
+ */ + + lastlbn = lblkno(fs, ip->i_size); + if (lastlbn < NDADDR && lastlbn < lbn) { + nb = lastlbn; + osize = blksize(fs, ip, nb); + if (osize < fs->fs_bsize && osize > 0) { + mutex_enter(&ump->um_lock); + error = ffs_realloccg(ip, nb, + ffs_blkpref_ufs2(ip, lastlbn, nb, flags, + &ip->i_ffs2_db[0]), + osize, (int)fs->fs_bsize, cred, bpp, &newb); + if (error) + return (error); + ip->i_size = lblktosize(fs, nb + 1); + ip->i_ffs2_size = ip->i_size; + uvm_vnp_setsize(vp, ip->i_size); + ip->i_ffs2_db[nb] = ufs_rw64(newb, needswap); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (bpp) { + if (flags & B_SYNC) + bwrite(*bpp); + else + bawrite(*bpp); + } + } + } + + /* + * The first NDADDR blocks are direct blocks + */ + + if (lbn < NDADDR) { + nb = ufs_rw64(ip->i_ffs2_db[lbn], needswap); + if (nb != 0 && ip->i_size >= lblktosize(fs, lbn + 1)) { + + /* + * The block is an already-allocated direct block + * and the file already extends past this block, + * thus this must be a whole block. + * Just read the block (if requested). + */ + + if (bpp != NULL) { + error = bread(vp, lbn, fs->fs_bsize, NOCRED, + B_MODIFY, bpp); + if (error) { + brelse(*bpp, 0); + return (error); + } + } + return (0); + } + if (nb != 0) { + + /* + * Consider need to reallocate a fragment. + */ + + osize = fragroundup(fs, blkoff(fs, ip->i_size)); + nsize = fragroundup(fs, size); + if (nsize <= osize) { + + /* + * The existing block is already + * at least as big as we want. + * Just read the block (if requested). + */ + + if (bpp != NULL) { + error = bread(vp, lbn, osize, NOCRED, + B_MODIFY, bpp); + if (error) { + brelse(*bpp, 0); + return (error); + } + } + return 0; + } else { + + /* + * The existing block is smaller than we want, + * grow it. + */ + mutex_enter(&ump->um_lock); + error = ffs_realloccg(ip, lbn, + ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags, + &ip->i_ffs2_db[0]), + osize, nsize, cred, bpp, &newb); + if (error) + return (error); + } + } else { + + /* + * the block was not previously allocated, + * allocate a new block or fragment. + */ + + if (ip->i_size < lblktosize(fs, lbn + 1)) + nsize = fragroundup(fs, size); + else + nsize = fs->fs_bsize; + mutex_enter(&ump->um_lock); + error = ffs_alloc(ip, lbn, + ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags, + &ip->i_ffs2_db[0]), + nsize, flags, cred, &newb); + if (error) + return (error); + if (bpp != NULL) { + error = ffs_getblk(vp, lbn, fsbtodb(fs, newb), + nsize, (flags & B_CLRBUF) != 0, bpp); + if (error) + return error; + } + } + ip->i_ffs2_db[lbn] = ufs_rw64(newb, needswap); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + return (0); + } + + /* + * Determine the number of levels of indirection. + */ + + pref = 0; + if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0) + return (error); + + /* + * Fetch the first indirect block allocating if necessary. + */ + + --num; + nb = ufs_rw64(ip->i_ffs2_ib[indirs[0].in_off], needswap); + allocib = NULL; + allocblk = allociblk; + if (nb == 0) { + mutex_enter(&ump->um_lock); + pref = ffs_blkpref_ufs2(ip, lbn, 0, flags | B_METAONLY, NULL); + error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, + flags | B_METAONLY, cred, &newb); + if (error) + goto fail; + nb = newb; + *allocblk++ = nb; + error = ffs_getblk(vp, indirs[1].in_lbn, fsbtodb(fs, nb), + fs->fs_bsize, true, &bp); + if (error) + goto fail; + /* + * Write synchronously so that indirect blocks + * never point at garbage. 
+ */ + if ((error = bwrite(bp)) != 0) + goto fail; + unwindidx = 0; + allocib = &ip->i_ffs2_ib[indirs[0].in_off]; + *allocib = ufs_rw64(nb, needswap); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } + + /* + * Fetch through the indirect blocks, allocating as necessary. + */ + + for (i = 1;;) { + error = bread(vp, + indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, 0, &bp); + if (error) { + brelse(bp, 0); + goto fail; + } + bap = (int64_t *)bp->b_data; + nb = ufs_rw64(bap[indirs[i].in_off], needswap); + if (i == num) + break; + i++; + if (nb != 0) { + brelse(bp, 0); + continue; + } + if (fscow_run(bp, true) != 0) { + brelse(bp, 0); + goto fail; + } + mutex_enter(&ump->um_lock); + /* Try to keep snapshot indirect blocks contiguous. */ + if (i == num && (ip->i_flags & SF_SNAPSHOT) != 0) + pref = ffs_blkpref_ufs2(ip, lbn, indirs[i-1].in_off, + flags | B_METAONLY, &bap[0]); + if (pref == 0) + pref = ffs_blkpref_ufs2(ip, lbn, 0, flags | B_METAONLY, + NULL); + error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, + flags | B_METAONLY, cred, &newb); + if (error) { + brelse(bp, 0); + goto fail; + } + nb = newb; + *allocblk++ = nb; + error = ffs_getblk(vp, indirs[i].in_lbn, fsbtodb(fs, nb), + fs->fs_bsize, true, &nbp); + if (error) { + brelse(bp, 0); + goto fail; + } + /* + * Write synchronously so that indirect blocks + * never point at garbage. + */ + if ((error = bwrite(nbp)) != 0) { + brelse(bp, 0); + goto fail; + } + if (unwindidx < 0) + unwindidx = i - 1; + bap[indirs[i - 1].in_off] = ufs_rw64(nb, needswap); + + /* + * If required, write synchronously, otherwise use + * delayed write. + */ + + if (flags & B_SYNC) { + bwrite(bp); + } else { + bdwrite(bp); + } + } + + if (flags & B_METAONLY) { + KASSERT(bpp != NULL); + *bpp = bp; + return (0); + } + + /* + * Get the data block, allocating if necessary. + */ + + if (nb == 0) { + if (fscow_run(bp, true) != 0) { + brelse(bp, 0); + goto fail; + } + mutex_enter(&ump->um_lock); + pref = ffs_blkpref_ufs2(ip, lbn, indirs[num].in_off, flags, + &bap[0]); + error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags, cred, + &newb); + if (error) { + brelse(bp, 0); + goto fail; + } + nb = newb; + *allocblk++ = nb; + if (bpp != NULL) { + error = ffs_getblk(vp, lbn, fsbtodb(fs, nb), + fs->fs_bsize, (flags & B_CLRBUF) != 0, bpp); + if (error) { + brelse(bp, 0); + goto fail; + } + } + bap[indirs[num].in_off] = ufs_rw64(nb, needswap); + if (allocib == NULL && unwindidx < 0) { + unwindidx = i - 1; + } + + /* + * If required, write synchronously, otherwise use + * delayed write. + */ + + if (flags & B_SYNC) { + bwrite(bp); + } else { + bdwrite(bp); + } + return (0); + } + brelse(bp, 0); + if (bpp != NULL) { + if (flags & B_CLRBUF) { + error = bread(vp, lbn, (int)fs->fs_bsize, + NOCRED, B_MODIFY, &nbp); + if (error) { + brelse(nbp, 0); + goto fail; + } + } else { + error = ffs_getblk(vp, lbn, fsbtodb(fs, nb), + fs->fs_bsize, true, &nbp); + if (error) + goto fail; + } + *bpp = nbp; + } + return (0); + +fail: + /* + * If we have failed part way through block allocation, we + * have to deallocate any indirect blocks that we have allocated. + */ + + if (unwindidx >= 0) { + + /* + * First write out any buffers we've created to resolve their + * softdeps. This must be done in reverse order of creation + * so that we resolve the dependencies in one pass. + * Write the cylinder group buffers for these buffers too. 
+ */ + + for (i = num; i >= unwindidx; i--) { + if (i == 0) { + break; + } + if (ffs_getblk(vp, indirs[i].in_lbn, FFS_NOBLK, + fs->fs_bsize, false, &bp) != 0) + continue; + if (bp->b_oflags & BO_DELWRI) { + nb = fsbtodb(fs, cgtod(fs, dtog(fs, + dbtofsb(fs, bp->b_blkno)))); + bwrite(bp); + if (ffs_getblk(ip->i_devvp, nb, FFS_NOBLK, + fs->fs_cgsize, false, &bp) != 0) + continue; + if (bp->b_oflags & BO_DELWRI) { + bwrite(bp); + } else { + brelse(bp, BC_INVAL); + } + } else { + brelse(bp, BC_INVAL); + } + } + + /* + * Now that any dependencies that we created have been + * resolved, we can undo the partial allocation. + */ + + if (unwindidx == 0) { + *allocib = 0; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } else { + int r; + + r = bread(vp, indirs[unwindidx].in_lbn, + (int)fs->fs_bsize, NOCRED, 0, &bp); + if (r) { + panic("Could not unwind indirect block, error %d", r); + brelse(bp, 0); + } else { + bap = (int64_t *)bp->b_data; + bap[indirs[unwindidx].in_off] = 0; + bwrite(bp); + } + } + for (i = unwindidx + 1; i <= num; i++) { + if (ffs_getblk(vp, indirs[i].in_lbn, FFS_NOBLK, + fs->fs_bsize, false, &bp) == 0) + brelse(bp, BC_INVAL); + } + } + for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) { + ffs_blkfree(fs, ip->i_devvp, *blkp, fs->fs_bsize, ip->i_number); + deallocated += fs->fs_bsize; + } + if (deallocated) { +#if defined(QUOTA) || defined(QUOTA2) + /* + * Restore user's disk quota because allocation failed. + */ + (void)chkdq(ip, -btodb(deallocated), cred, FORCE); +#endif + ip->i_ffs2_blocks -= btodb(deallocated); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } + + return (error); +} diff --git a/sys/ufs/ffs/ffs_bswap.c b/sys/ufs/ffs/ffs_bswap.c new file mode 100644 index 000000000..ddac30db0 --- /dev/null +++ b/sys/ufs/ffs/ffs_bswap.c @@ -0,0 +1,271 @@ +/* $NetBSD: ffs_bswap.c,v 1.35 2011/03/06 17:08:38 bouyer Exp $ */ + +/* + * Copyright (c) 1998 Manuel Bouyer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +__KERNEL_RCSID(0, "$NetBSD: ffs_bswap.c,v 1.35 2011/03/06 17:08:38 bouyer Exp $"); + +#include +#if defined(_KERNEL) +#include +#endif + +#include +#include +#include +#include +#include + +#if !defined(_KERNEL) +#include +#include +#include +#include +#define panic(x) printf("%s\n", (x)), abort() +#endif + +void +ffs_sb_swap(struct fs *o, struct fs *n) +{ + size_t i; + u_int32_t *o32, *n32; + + /* + * In order to avoid a lot of lines, as the first N fields (52) + * of the superblock up to fs_fmod are u_int32_t, we just loop + * here to convert them. + */ + o32 = (u_int32_t *)o; + n32 = (u_int32_t *)n; + for (i = 0; i < offsetof(struct fs, fs_fmod) / sizeof(u_int32_t); i++) + n32[i] = bswap32(o32[i]); + + n->fs_swuid = bswap64(o->fs_swuid); + n->fs_cgrotor = bswap32(o->fs_cgrotor); /* Unused */ + n->fs_old_cpc = bswap32(o->fs_old_cpc); + + /* These fields overlap with a possible location for the + * historic FS_DYNAMICPOSTBLFMT postbl table, and with the + * first half of the historic FS_42POSTBLFMT postbl table. + */ + n->fs_maxbsize = bswap32(o->fs_maxbsize); + /* XXX journal */ + n->fs_quota_magic = bswap32(o->fs_quota_magic); + for (i = 0; i < MAXQUOTAS; i++) + n->fs_quotafile[i] = bswap64(o->fs_quotafile[i]); + n->fs_sblockloc = bswap64(o->fs_sblockloc); + ffs_csumtotal_swap(&o->fs_cstotal, &n->fs_cstotal); + n->fs_time = bswap64(o->fs_time); + n->fs_size = bswap64(o->fs_size); + n->fs_dsize = bswap64(o->fs_dsize); + n->fs_csaddr = bswap64(o->fs_csaddr); + n->fs_pendingblocks = bswap64(o->fs_pendingblocks); + n->fs_pendinginodes = bswap32(o->fs_pendinginodes); + + /* These fields overlap with the second half of the + * historic FS_42POSTBLFMT postbl table + */ + for (i = 0; i < FSMAXSNAP; i++) + n->fs_snapinum[i] = bswap32(o->fs_snapinum[i]); + n->fs_avgfilesize = bswap32(o->fs_avgfilesize); + n->fs_avgfpdir = bswap32(o->fs_avgfpdir); + /* fs_sparecon[28] - ignore for now */ + n->fs_flags = bswap32(o->fs_flags); + n->fs_contigsumsize = bswap32(o->fs_contigsumsize); + n->fs_maxsymlinklen = bswap32(o->fs_maxsymlinklen); + n->fs_old_inodefmt = bswap32(o->fs_old_inodefmt); + n->fs_maxfilesize = bswap64(o->fs_maxfilesize); + n->fs_qbmask = bswap64(o->fs_qbmask); + n->fs_qfmask = bswap64(o->fs_qfmask); + n->fs_state = bswap32(o->fs_state); + n->fs_old_postblformat = bswap32(o->fs_old_postblformat); + n->fs_old_nrpos = bswap32(o->fs_old_nrpos); + n->fs_old_postbloff = bswap32(o->fs_old_postbloff); + n->fs_old_rotbloff = bswap32(o->fs_old_rotbloff); + + n->fs_magic = bswap32(o->fs_magic); +} + +void +ffs_dinode1_swap(struct ufs1_dinode *o, struct ufs1_dinode *n) +{ + + n->di_mode = bswap16(o->di_mode); + n->di_nlink = bswap16(o->di_nlink); + n->di_u.oldids[0] = bswap16(o->di_u.oldids[0]); + n->di_u.oldids[1] = bswap16(o->di_u.oldids[1]); + n->di_size = bswap64(o->di_size); + n->di_atime = bswap32(o->di_atime); + n->di_atimensec = bswap32(o->di_atimensec); + n->di_mtime = bswap32(o->di_mtime); + n->di_mtimensec = bswap32(o->di_mtimensec); + n->di_ctime = bswap32(o->di_ctime); + n->di_ctimensec = bswap32(o->di_ctimensec); + memcpy(n->di_db, o->di_db, (NDADDR + NIADDR) * sizeof(u_int32_t)); + n->di_flags = bswap32(o->di_flags); + n->di_blocks = bswap32(o->di_blocks); + n->di_gen = bswap32(o->di_gen); + n->di_uid = bswap32(o->di_uid); + n->di_gid = bswap32(o->di_gid); +} + +void +ffs_dinode2_swap(struct ufs2_dinode *o, struct ufs2_dinode *n) +{ + n->di_mode = bswap16(o->di_mode); + n->di_nlink = 
bswap16(o->di_nlink); + n->di_uid = bswap32(o->di_uid); + n->di_gid = bswap32(o->di_gid); + n->di_blksize = bswap32(o->di_blksize); + n->di_size = bswap64(o->di_size); + n->di_blocks = bswap64(o->di_blocks); + n->di_atime = bswap64(o->di_atime); + n->di_atimensec = bswap32(o->di_atimensec); + n->di_mtime = bswap64(o->di_mtime); + n->di_mtimensec = bswap32(o->di_mtimensec); + n->di_ctime = bswap64(o->di_ctime); + n->di_ctimensec = bswap32(o->di_ctimensec); + n->di_birthtime = bswap64(o->di_birthtime); + n->di_birthnsec = bswap32(o->di_birthnsec); + n->di_gen = bswap32(o->di_gen); + n->di_kernflags = bswap32(o->di_kernflags); + n->di_flags = bswap32(o->di_flags); + n->di_extsize = bswap32(o->di_extsize); + memcpy(n->di_extb, o->di_extb, (NXADDR + NDADDR + NIADDR) * 8); +} + +void +ffs_csum_swap(struct csum *o, struct csum *n, int size) +{ + size_t i; + u_int32_t *oint, *nint; + + oint = (u_int32_t*)o; + nint = (u_int32_t*)n; + + for (i = 0; i < size / sizeof(u_int32_t); i++) + nint[i] = bswap32(oint[i]); +} + +void +ffs_csumtotal_swap(struct csum_total *o, struct csum_total *n) +{ + n->cs_ndir = bswap64(o->cs_ndir); + n->cs_nbfree = bswap64(o->cs_nbfree); + n->cs_nifree = bswap64(o->cs_nifree); + n->cs_nffree = bswap64(o->cs_nffree); +} + +/* + * Note that ffs_cg_swap may be called with o == n. + */ +void +ffs_cg_swap(struct cg *o, struct cg *n, struct fs *fs) +{ + int i; + u_int32_t *n32, *o32; + u_int16_t *n16, *o16; + int32_t btotoff, boff, clustersumoff; + + n->cg_firstfield = bswap32(o->cg_firstfield); + n->cg_magic = bswap32(o->cg_magic); + n->cg_old_time = bswap32(o->cg_old_time); + n->cg_cgx = bswap32(o->cg_cgx); + n->cg_old_ncyl = bswap16(o->cg_old_ncyl); + n->cg_old_niblk = bswap16(o->cg_old_niblk); + n->cg_ndblk = bswap32(o->cg_ndblk); + n->cg_cs.cs_ndir = bswap32(o->cg_cs.cs_ndir); + n->cg_cs.cs_nbfree = bswap32(o->cg_cs.cs_nbfree); + n->cg_cs.cs_nifree = bswap32(o->cg_cs.cs_nifree); + n->cg_cs.cs_nffree = bswap32(o->cg_cs.cs_nffree); + n->cg_rotor = bswap32(o->cg_rotor); + n->cg_frotor = bswap32(o->cg_frotor); + n->cg_irotor = bswap32(o->cg_irotor); + for (i = 0; i < MAXFRAG; i++) + n->cg_frsum[i] = bswap32(o->cg_frsum[i]); + + if ((fs->fs_magic != FS_UFS2_MAGIC) && + (fs->fs_old_postblformat == FS_42POSTBLFMT)) { /* old format */ + struct ocg *on, *oo; + int j; + on = (struct ocg *)n; + oo = (struct ocg *)o; + + for (i = 0; i < 32; i++) { + on->cg_btot[i] = bswap32(oo->cg_btot[i]); + for (j = 0; j < 8; j++) + on->cg_b[i][j] = bswap16(oo->cg_b[i][j]); + } + memmove(on->cg_iused, oo->cg_iused, 256); + on->cg_magic = bswap32(oo->cg_magic); + } else { /* new format */ + + n->cg_old_btotoff = bswap32(o->cg_old_btotoff); + n->cg_old_boff = bswap32(o->cg_old_boff); + n->cg_iusedoff = bswap32(o->cg_iusedoff); + n->cg_freeoff = bswap32(o->cg_freeoff); + n->cg_nextfreeoff = bswap32(o->cg_nextfreeoff); + n->cg_clustersumoff = bswap32(o->cg_clustersumoff); + n->cg_clusteroff = bswap32(o->cg_clusteroff); + n->cg_nclusterblks = bswap32(o->cg_nclusterblks); + n->cg_niblk = bswap32(o->cg_niblk); + n->cg_initediblk = bswap32(o->cg_initediblk); + n->cg_time = bswap64(o->cg_time); + + if (n->cg_magic == CG_MAGIC) { + btotoff = n->cg_old_btotoff; + boff = n->cg_old_boff; + clustersumoff = n->cg_clustersumoff; + } else { + btotoff = bswap32(n->cg_old_btotoff); + boff = bswap32(n->cg_old_boff); + clustersumoff = bswap32(n->cg_clustersumoff); + } + + n32 = (u_int32_t *)((u_int8_t *)n + clustersumoff); + o32 = (u_int32_t *)((u_int8_t *)o + clustersumoff); + for (i = 1; i < fs->fs_contigsumsize + 1; 
i++) + n32[i] = bswap32(o32[i]); + + if (fs->fs_magic == FS_UFS2_MAGIC) + return; + + n32 = (u_int32_t *)((u_int8_t *)n + btotoff); + o32 = (u_int32_t *)((u_int8_t *)o + btotoff); + n16 = (u_int16_t *)((u_int8_t *)n + boff); + o16 = (u_int16_t *)((u_int8_t *)o + boff); + + for (i = 0; i < fs->fs_old_cpg; i++) + n32[i] = bswap32(o32[i]); + + for (i = 0; i < fs->fs_old_cpg * fs->fs_old_nrpos; i++) + n16[i] = bswap16(o16[i]); + } +} diff --git a/include/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h similarity index 100% rename from include/ufs/ffs/ffs_extern.h rename to sys/ufs/ffs/ffs_extern.h diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c new file mode 100644 index 000000000..0f6edcb7d --- /dev/null +++ b/sys/ufs/ffs/ffs_inode.c @@ -0,0 +1,725 @@ +/* $NetBSD: ffs_inode.c,v 1.108 2011/11/23 19:42:10 bouyer Exp $ */ + +/*- + * Copyright (c) 2008 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_inode.c 8.13 (Berkeley) 4/21/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ffs_inode.c,v 1.108 2011/11/23 19:42:10 bouyer Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_ffs.h" +#include "opt_quota.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +static int ffs_indirtrunc(struct inode *, daddr_t, daddr_t, daddr_t, int, + int64_t *); + +/* + * Update the access, modified, and inode change times as specified + * by the IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively. + * The IN_MODIFIED flag is used to specify that the inode needs to be + * updated but that the times have already been set. The access + * and modified times are taken from the second and third parameters; + * the inode change time is always taken from the current time. If + * UPDATE_WAIT flag is set, or UPDATE_DIROP is set then wait for the + * disk write of the inode to complete. + */ + +int +ffs_update(struct vnode *vp, const struct timespec *acc, + const struct timespec *mod, int updflags) +{ + struct fs *fs; + struct buf *bp; + struct inode *ip; + int error; + void *cp; + int waitfor, flags; + + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (0); + ip = VTOI(vp); + FFS_ITIMES(ip, acc, mod, NULL); + if (updflags & UPDATE_CLOSE) + flags = ip->i_flag & (IN_MODIFIED | IN_ACCESSED); + else + flags = ip->i_flag & IN_MODIFIED; + if (flags == 0) + return (0); + fs = ip->i_fs; + + if ((flags & IN_MODIFIED) != 0 && + (vp->v_mount->mnt_flag & MNT_ASYNC) == 0) { + waitfor = updflags & UPDATE_WAIT; + if ((updflags & UPDATE_DIROP) != 0) + waitfor |= UPDATE_WAIT; + } else + waitfor = 0; + + /* + * Ensure that uid and gid are correct. This is a temporary + * fix until fsck has been changed to do the update. 
+ */ + if (fs->fs_magic == FS_UFS1_MAGIC && /* XXX */ + fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */ + ip->i_ffs1_ouid = ip->i_uid; /* XXX */ + ip->i_ffs1_ogid = ip->i_gid; /* XXX */ + } /* XXX */ + error = bread(ip->i_devvp, + fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), + (int)fs->fs_bsize, NOCRED, B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + return (error); + } + ip->i_flag &= ~(IN_MODIFIED | IN_ACCESSED); + /* Keep unlinked inode list up to date */ + KDASSERT(DIP(ip, nlink) == ip->i_nlink); + if (ip->i_mode) { + if (ip->i_nlink > 0) { + UFS_WAPBL_UNREGISTER_INODE(ip->i_ump->um_mountp, + ip->i_number, ip->i_mode); + } else { + UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp, + ip->i_number, ip->i_mode); + } + } + if (fs->fs_magic == FS_UFS1_MAGIC) { + cp = (char *)bp->b_data + + (ino_to_fsbo(fs, ip->i_number) * DINODE1_SIZE); +#ifdef FFS_EI + if (UFS_FSNEEDSWAP(fs)) + ffs_dinode1_swap(ip->i_din.ffs1_din, + (struct ufs1_dinode *)cp); + else +#endif + memcpy(cp, ip->i_din.ffs1_din, DINODE1_SIZE); + } else { + cp = (char *)bp->b_data + + (ino_to_fsbo(fs, ip->i_number) * DINODE2_SIZE); +#ifdef FFS_EI + if (UFS_FSNEEDSWAP(fs)) + ffs_dinode2_swap(ip->i_din.ffs2_din, + (struct ufs2_dinode *)cp); + else +#endif + memcpy(cp, ip->i_din.ffs2_din, DINODE2_SIZE); + } + if (waitfor) { + return (bwrite(bp)); + } else { + bdwrite(bp); + return (0); + } +} + +#define SINGLE 0 /* index of single indirect block */ +#define DOUBLE 1 /* index of double indirect block */ +#define TRIPLE 2 /* index of triple indirect block */ +/* + * Truncate the inode oip to at most length size, freeing the + * disk blocks. + */ +int +ffs_truncate(struct vnode *ovp, off_t length, int ioflag, kauth_cred_t cred) +{ + daddr_t lastblock; + struct inode *oip = VTOI(ovp); + daddr_t bn, lastiblock[NIADDR], indir_lbn[NIADDR]; + daddr_t blks[NDADDR + NIADDR]; + struct fs *fs; + int offset, pgoffset, level; + int64_t count, blocksreleased = 0; + int i, aflag, nblocks; + int error, allerror = 0; + off_t osize; + int sync; + struct ufsmount *ump = oip->i_ump; + + if (ovp->v_type == VCHR || ovp->v_type == VBLK || + ovp->v_type == VFIFO || ovp->v_type == VSOCK) { + KASSERT(oip->i_size == 0); + return 0; + } + + if (length < 0) + return (EINVAL); + + if (ovp->v_type == VLNK && + (oip->i_size < ump->um_maxsymlinklen || + (ump->um_maxsymlinklen == 0 && DIP(oip, blocks) == 0))) { + KDASSERT(length == 0); + memset(SHORTLINK(oip), 0, (size_t)oip->i_size); + oip->i_size = 0; + DIP_ASSIGN(oip, size, 0); + oip->i_flag |= IN_CHANGE | IN_UPDATE; + return (ffs_update(ovp, NULL, NULL, 0)); + } + if (oip->i_size == length) { + /* still do a uvm_vnp_setsize() as writesize may be larger */ + uvm_vnp_setsize(ovp, length); + oip->i_flag |= IN_CHANGE | IN_UPDATE; + return (ffs_update(ovp, NULL, NULL, 0)); + } + fs = oip->i_fs; + if (length > ump->um_maxfilesize) + return (EFBIG); + + if ((oip->i_flags & SF_SNAPSHOT) != 0) + ffs_snapremove(ovp); + + osize = oip->i_size; + aflag = ioflag & IO_SYNC ? B_SYNC : 0; + + /* + * Lengthen the size of the file. We must ensure that the + * last byte of the file is allocated. Since the smallest + * value of osize is 0, length will be at least 1. 
+ */ + + if (osize < length) { + if (lblkno(fs, osize) < NDADDR && + lblkno(fs, osize) != lblkno(fs, length) && + blkroundup(fs, osize) != osize) { + off_t eob; + + eob = blkroundup(fs, osize); + uvm_vnp_setwritesize(ovp, eob); + error = ufs_balloc_range(ovp, osize, eob - osize, + cred, aflag); + if (error) { + (void) ffs_truncate(ovp, osize, + ioflag & IO_SYNC, cred); + return error; + } + if (ioflag & IO_SYNC) { + mutex_enter(ovp->v_interlock); + VOP_PUTPAGES(ovp, + trunc_page(osize & fs->fs_bmask), + round_page(eob), PGO_CLEANIT | PGO_SYNCIO | + PGO_JOURNALLOCKED); + } + } + uvm_vnp_setwritesize(ovp, length); + error = ufs_balloc_range(ovp, length - 1, 1, cred, aflag); + if (error) { + (void) ffs_truncate(ovp, osize, ioflag & IO_SYNC, cred); + return (error); + } + uvm_vnp_setsize(ovp, length); + oip->i_flag |= IN_CHANGE | IN_UPDATE; + KASSERT(ovp->v_size == oip->i_size); + return (ffs_update(ovp, NULL, NULL, 0)); + } + + /* + * When truncating a regular file down to a non-block-aligned size, + * we must zero the part of last block which is past the new EOF. + * We must synchronously flush the zeroed pages to disk + * since the new pages will be invalidated as soon as we + * inform the VM system of the new, smaller size. + * We must do this before acquiring the GLOCK, since fetching + * the pages will acquire the GLOCK internally. + * So there is a window where another thread could see a whole + * zeroed page past EOF, but that's life. + */ + + offset = blkoff(fs, length); + pgoffset = length & PAGE_MASK; + if (ovp->v_type == VREG && (pgoffset != 0 || offset != 0) && + osize > length) { + daddr_t lbn; + voff_t eoz; + int size; + + if (offset != 0) { + error = ufs_balloc_range(ovp, length - 1, 1, cred, + aflag); + if (error) + return error; + } + lbn = lblkno(fs, length); + size = blksize(fs, oip, lbn); + eoz = MIN(MAX(lblktosize(fs, lbn) + size, round_page(pgoffset)), + osize); + ubc_zerorange(&ovp->v_uobj, length, eoz - length, + UBC_UNMAP_FLAG(ovp)); + if (round_page(eoz) > round_page(length)) { + mutex_enter(ovp->v_interlock); + error = VOP_PUTPAGES(ovp, round_page(length), + round_page(eoz), + PGO_CLEANIT | PGO_DEACTIVATE | PGO_JOURNALLOCKED | + ((ioflag & IO_SYNC) ? PGO_SYNCIO : 0)); + if (error) + return error; + } + } + + genfs_node_wrlock(ovp); + oip->i_size = length; + DIP_ASSIGN(oip, size, length); + uvm_vnp_setsize(ovp, length); + /* + * Calculate index into inode's block list of + * last direct and indirect blocks (if any) + * which we want to keep. Lastblock is -1 when + * the file is truncated to 0. + */ + lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1; + lastiblock[SINGLE] = lastblock - NDADDR; + lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs); + lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs); + nblocks = btodb(fs->fs_bsize); + /* + * Update file and block pointers on disk before we start freeing + * blocks. If we crash before free'ing blocks below, the blocks + * will be returned to the free list. lastiblock values are also + * normalized to -1 for calls to ffs_indirtrunc below. 
+ */ + sync = 0; + for (level = TRIPLE; level >= SINGLE; level--) { + blks[NDADDR + level] = DIP(oip, ib[level]); + if (lastiblock[level] < 0 && blks[NDADDR + level] != 0) { + sync = 1; + DIP_ASSIGN(oip, ib[level], 0); + lastiblock[level] = -1; + } + } + for (i = 0; i < NDADDR; i++) { + blks[i] = DIP(oip, db[i]); + if (i > lastblock && blks[i] != 0) { + sync = 1; + DIP_ASSIGN(oip, db[i], 0); + } + } + oip->i_flag |= IN_CHANGE | IN_UPDATE; + if (sync) { + error = ffs_update(ovp, NULL, NULL, UPDATE_WAIT); + if (error && !allerror) + allerror = error; + } + + /* + * Having written the new inode to disk, save its new configuration + * and put back the old block pointers long enough to process them. + * Note that we save the new block configuration so we can check it + * when we are done. + */ + for (i = 0; i < NDADDR; i++) { + bn = DIP(oip, db[i]); + DIP_ASSIGN(oip, db[i], blks[i]); + blks[i] = bn; + } + for (i = 0; i < NIADDR; i++) { + bn = DIP(oip, ib[i]); + DIP_ASSIGN(oip, ib[i], blks[NDADDR + i]); + blks[NDADDR + i] = bn; + } + + oip->i_size = osize; + DIP_ASSIGN(oip, size, osize); + error = vtruncbuf(ovp, lastblock + 1, 0, 0); + if (error && !allerror) + allerror = error; + + /* + * Indirect blocks first. + */ + indir_lbn[SINGLE] = -NDADDR; + indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) - 1; + indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1; + for (level = TRIPLE; level >= SINGLE; level--) { + if (oip->i_ump->um_fstype == UFS1) + bn = ufs_rw32(oip->i_ffs1_ib[level],UFS_FSNEEDSWAP(fs)); + else + bn = ufs_rw64(oip->i_ffs2_ib[level],UFS_FSNEEDSWAP(fs)); + if (bn != 0) { + error = ffs_indirtrunc(oip, indir_lbn[level], + fsbtodb(fs, bn), lastiblock[level], level, &count); + if (error) + allerror = error; + blocksreleased += count; + if (lastiblock[level] < 0) { + DIP_ASSIGN(oip, ib[level], 0); + if (oip->i_ump->um_mountp->mnt_wapbl) { + UFS_WAPBL_REGISTER_DEALLOCATION( + oip->i_ump->um_mountp, + fsbtodb(fs, bn), fs->fs_bsize); + } else + ffs_blkfree(fs, oip->i_devvp, bn, + fs->fs_bsize, oip->i_number); + blocksreleased += nblocks; + } + } + if (lastiblock[level] >= 0) + goto done; + } + + /* + * All whole direct blocks or frags. + */ + for (i = NDADDR - 1; i > lastblock; i--) { + long bsize; + + if (oip->i_ump->um_fstype == UFS1) + bn = ufs_rw32(oip->i_ffs1_db[i], UFS_FSNEEDSWAP(fs)); + else + bn = ufs_rw64(oip->i_ffs2_db[i], UFS_FSNEEDSWAP(fs)); + if (bn == 0) + continue; + DIP_ASSIGN(oip, db[i], 0); + bsize = blksize(fs, oip, i); + if ((oip->i_ump->um_mountp->mnt_wapbl) && + (ovp->v_type != VREG)) { + UFS_WAPBL_REGISTER_DEALLOCATION(oip->i_ump->um_mountp, + fsbtodb(fs, bn), bsize); + } else + ffs_blkfree(fs, oip->i_devvp, bn, bsize, oip->i_number); + blocksreleased += btodb(bsize); + } + if (lastblock < 0) + goto done; + + /* + * Finally, look for a change in size of the + * last direct block; release any frags. + */ + if (oip->i_ump->um_fstype == UFS1) + bn = ufs_rw32(oip->i_ffs1_db[lastblock], UFS_FSNEEDSWAP(fs)); + else + bn = ufs_rw64(oip->i_ffs2_db[lastblock], UFS_FSNEEDSWAP(fs)); + if (bn != 0) { + long oldspace, newspace; + + /* + * Calculate amount of space we're giving + * back as old block size minus new block size. 
+ */ + oldspace = blksize(fs, oip, lastblock); + oip->i_size = length; + DIP_ASSIGN(oip, size, length); + newspace = blksize(fs, oip, lastblock); + if (newspace == 0) + panic("itrunc: newspace"); + if (oldspace - newspace > 0) { + /* + * Block number of space to be free'd is + * the old block # plus the number of frags + * required for the storage we're keeping. + */ + bn += numfrags(fs, newspace); + if ((oip->i_ump->um_mountp->mnt_wapbl) && + (ovp->v_type != VREG)) { + UFS_WAPBL_REGISTER_DEALLOCATION( + oip->i_ump->um_mountp, fsbtodb(fs, bn), + oldspace - newspace); + } else + ffs_blkfree(fs, oip->i_devvp, bn, + oldspace - newspace, oip->i_number); + blocksreleased += btodb(oldspace - newspace); + } + } + +done: +#ifdef DIAGNOSTIC + for (level = SINGLE; level <= TRIPLE; level++) + if (blks[NDADDR + level] != DIP(oip, ib[level])) + panic("itrunc1"); + for (i = 0; i < NDADDR; i++) + if (blks[i] != DIP(oip, db[i])) + panic("itrunc2"); + if (length == 0 && + (!LIST_EMPTY(&ovp->v_cleanblkhd) || !LIST_EMPTY(&ovp->v_dirtyblkhd))) + panic("itrunc3"); +#endif /* DIAGNOSTIC */ + /* + * Put back the real size. + */ + oip->i_size = length; + DIP_ASSIGN(oip, size, length); + DIP_ADD(oip, blocks, -blocksreleased); + genfs_node_unlock(ovp); + oip->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(ovp, NULL, NULL, 0); +#if defined(QUOTA) || defined(QUOTA2) + (void) chkdq(oip, -blocksreleased, NOCRED, 0); +#endif + KASSERT(ovp->v_type != VREG || ovp->v_size == oip->i_size); + return (allerror); +} + +/* + * Release blocks associated with the inode ip and stored in the indirect + * block bn. Blocks are free'd in LIFO order up to (but not including) + * lastbn. If level is greater than SINGLE, the block is an indirect block + * and recursive calls to indirtrunc must be used to cleanse other indirect + * blocks. + * + * NB: triple indirect blocks are untested. + */ +static int +ffs_indirtrunc(struct inode *ip, daddr_t lbn, daddr_t dbn, daddr_t lastbn, + int level, int64_t *countp) +{ + int i; + struct buf *bp; + struct fs *fs = ip->i_fs; + int32_t *bap1 = NULL; + int64_t *bap2 = NULL; + struct vnode *vp; + daddr_t nb, nlbn, last; + char *copy = NULL; + int64_t blkcount, factor, blocksreleased = 0; + int nblocks; + int error = 0, allerror = 0; +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif +#define RBAP(ip, i) (((ip)->i_ump->um_fstype == UFS1) ? \ + ufs_rw32(bap1[i], needswap) : ufs_rw64(bap2[i], needswap)) +#define BAP_ASSIGN(ip, i, value) \ + do { \ + if ((ip)->i_ump->um_fstype == UFS1) \ + bap1[i] = (value); \ + else \ + bap2[i] = (value); \ + } while(0) + + /* + * Calculate index in current block of last + * block to be kept. -1 indicates the entire + * block so we need not calculate the index. + */ + factor = 1; + for (i = SINGLE; i < level; i++) + factor *= NINDIR(fs); + last = lastbn; + if (lastbn > 0) + last /= factor; + nblocks = btodb(fs->fs_bsize); + /* + * Get buffer of block pointers, zero those entries corresponding + * to blocks to be free'd, and update on disk copy first. Since + * double(triple) indirect before single(double) indirect, calls + * to bmap on these blocks will fail. However, we already have + * the on disk address, so we have to set the b_blkno field + * explicitly instead of letting bread do everything for us. + */ + vp = ITOV(ip); + error = ffs_getblk(vp, lbn, FFS_NOBLK, fs->fs_bsize, false, &bp); + if (error) { + *countp = 0; + return error; + } + if (bp->b_oflags & (BO_DONE | BO_DELWRI)) { + /* Braces must be here in case trace evaluates to nothing. 
*/ + trace(TR_BREADHIT, pack(vp, fs->fs_bsize), lbn); + } else { + trace(TR_BREADMISS, pack(vp, fs->fs_bsize), lbn); + curlwp->l_ru.ru_inblock++; /* pay for read */ + bp->b_flags |= B_READ; + bp->b_flags &= ~B_COWDONE; /* we change blkno below */ + if (bp->b_bcount > bp->b_bufsize) + panic("ffs_indirtrunc: bad buffer size"); + bp->b_blkno = dbn; + BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); + VOP_STRATEGY(vp, bp); + error = biowait(bp); + if (error == 0) + error = fscow_run(bp, true); + } + if (error) { + brelse(bp, 0); + *countp = 0; + return (error); + } + + if (ip->i_ump->um_fstype == UFS1) + bap1 = (int32_t *)bp->b_data; + else + bap2 = (int64_t *)bp->b_data; + if (lastbn >= 0) { + copy = malloc(fs->fs_bsize, M_TEMP, M_WAITOK); + memcpy((void *)copy, bp->b_data, (u_int)fs->fs_bsize); + for (i = last + 1; i < NINDIR(fs); i++) + BAP_ASSIGN(ip, i, 0); + error = bwrite(bp); + if (error) + allerror = error; + if (ip->i_ump->um_fstype == UFS1) + bap1 = (int32_t *)copy; + else + bap2 = (int64_t *)copy; + } + + /* + * Recursively free totally unused blocks. + */ + for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last; + i--, nlbn += factor) { + nb = RBAP(ip, i); + if (nb == 0) + continue; + if (level > SINGLE) { + error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb), + (daddr_t)-1, level - 1, + &blkcount); + if (error) + allerror = error; + blocksreleased += blkcount; + } + if ((ip->i_ump->um_mountp->mnt_wapbl) && + ((level > SINGLE) || (ITOV(ip)->v_type != VREG))) { + UFS_WAPBL_REGISTER_DEALLOCATION(ip->i_ump->um_mountp, + fsbtodb(fs, nb), fs->fs_bsize); + } else + ffs_blkfree(fs, ip->i_devvp, nb, fs->fs_bsize, + ip->i_number); + blocksreleased += nblocks; + } + + /* + * Recursively free last partial block. + */ + if (level > SINGLE && lastbn >= 0) { + last = lastbn % factor; + nb = RBAP(ip, i); + if (nb != 0) { + error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb), + last, level - 1, &blkcount); + if (error) + allerror = error; + blocksreleased += blkcount; + } + } + + if (copy != NULL) { + free(copy, M_TEMP); + } else { + brelse(bp, BC_INVAL); + } + + *countp = blocksreleased; + return (allerror); +} + +void +ffs_itimes(struct inode *ip, const struct timespec *acc, + const struct timespec *mod, const struct timespec *cre) +{ + struct timespec now; + + if (!(ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY))) { + return; + } + + vfs_timestamp(&now); + if (ip->i_flag & IN_ACCESS) { + if (acc == NULL) + acc = &now; + DIP_ASSIGN(ip, atime, acc->tv_sec); + DIP_ASSIGN(ip, atimensec, acc->tv_nsec); + } + if (ip->i_flag & (IN_UPDATE | IN_MODIFY)) { + if ((ip->i_flags & SF_SNAPSHOT) == 0) { + if (mod == NULL) + mod = &now; + DIP_ASSIGN(ip, mtime, mod->tv_sec); + DIP_ASSIGN(ip, mtimensec, mod->tv_nsec); + } + ip->i_modrev++; + } + if (ip->i_flag & (IN_CHANGE | IN_MODIFY)) { + if (cre == NULL) + cre = &now; + DIP_ASSIGN(ip, ctime, cre->tv_sec); + DIP_ASSIGN(ip, ctimensec, cre->tv_nsec); + } + if (ip->i_flag & (IN_ACCESS | IN_MODIFY)) + ip->i_flag |= IN_ACCESSED; + if (ip->i_flag & (IN_UPDATE | IN_CHANGE)) + ip->i_flag |= IN_MODIFIED; + ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY); +} diff --git a/sys/ufs/ffs/ffs_quota2.c b/sys/ufs/ffs/ffs_quota2.c new file mode 100644 index 000000000..b3d45b38d --- /dev/null +++ b/sys/ufs/ffs/ffs_quota2.c @@ -0,0 +1,118 @@ +/* $NetBSD: ffs_quota2.c,v 1.4 2011/06/12 03:36:00 rmind Exp $ */ +/*- + * Copyright (c) 2010 Manuel Bouyer + * All rights reserved. 
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include
+__KERNEL_RCSID(0, "$NetBSD: ffs_quota2.c,v 1.4 2011/06/12 03:36:00 rmind Exp $");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+
+int
+ffs_quota2_mount(struct mount *mp)
+{
+	struct ufsmount *ump = VFSTOUFS(mp);
+	struct fs *fs = ump->um_fs;
+	int error = 0;
+	struct vnode *vp;
+	struct lwp *l = curlwp;
+
+	if ((fs->fs_flags & FS_DOQUOTA2) == 0)
+		return 0;
+
+	ump->um_flags |= UFS_QUOTA2;
+	ump->umq2_bsize = fs->fs_bsize;
+	ump->umq2_bmask = fs->fs_qbmask;
+	if (fs->fs_quota_magic != Q2_HEAD_MAGIC) {
+		printf("%s: Invalid quota magic number\n",
+		    mp->mnt_stat.f_mntonname);
+		return EINVAL;
+	}
+	if ((fs->fs_quota_flags & FS_Q2_DO_TYPE(USRQUOTA)) &&
+	    fs->fs_quotafile[USRQUOTA] == 0) {
+		printf("%s: no user quota inode\n",
+		    mp->mnt_stat.f_mntonname);
+		error = EINVAL;
+	}
+	if ((fs->fs_quota_flags & FS_Q2_DO_TYPE(GRPQUOTA)) &&
+	    fs->fs_quotafile[GRPQUOTA] == 0) {
+		printf("%s: no group quota inode\n",
+		    mp->mnt_stat.f_mntonname);
+		error = EINVAL;
+	}
+	if (error)
+		return error;
+
+	if (fs->fs_quota_flags & FS_Q2_DO_TYPE(USRQUOTA) &&
+	    ump->um_quotas[USRQUOTA] == NULLVP) {
+		error = VFS_VGET(mp, fs->fs_quotafile[USRQUOTA], &vp);
+		if (error) {
+			printf("%s: can't vget() user quota inode: %d\n",
+			    mp->mnt_stat.f_mntonname, error);
+			return error;
+		}
+		ump->um_quotas[USRQUOTA] = vp;
+		ump->um_cred[USRQUOTA] = l->l_cred;
+		mutex_enter(vp->v_interlock);
+		vp->v_writecount++;
+		mutex_exit(vp->v_interlock);
+		VOP_UNLOCK(vp);
+	}
+	if (fs->fs_quota_flags & FS_Q2_DO_TYPE(GRPQUOTA) &&
+	    ump->um_quotas[GRPQUOTA] == NULLVP) {
+		error = VFS_VGET(mp, fs->fs_quotafile[GRPQUOTA], &vp);
+		if (error) {
+			vn_close(ump->um_quotas[USRQUOTA],
+			    FREAD|FWRITE, l->l_cred);
+			printf("%s: can't vget() group quota inode: %d\n",
+			    mp->mnt_stat.f_mntonname, error);
+			return error;
+		}
+		ump->um_quotas[GRPQUOTA] = vp;
+		ump->um_cred[GRPQUOTA] = l->l_cred;
+		mutex_enter(vp->v_interlock);
+		vp->v_vflag |= VV_SYSTEM;
+		vp->v_writecount++;
+		mutex_exit(vp->v_interlock);
+		VOP_UNLOCK(vp);
+	}
+	mp->mnt_flag |= MNT_QUOTA;
+	return 0;
+}
diff --git a/sys/ufs/ffs/ffs_snapshot.c b/sys/ufs/ffs/ffs_snapshot.c
new
file mode 100644
index 000000000..b1e07c11c
--- /dev/null
+++ b/sys/ufs/ffs/ffs_snapshot.c
@@ -0,0 +1,2331 @@
+/*	$NetBSD: ffs_snapshot.c,v 1.118 2011/10/07 09:35:06 hannken Exp $	*/
+
+/*
+ * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
+ *
+ * Further information about snapshots can be obtained from:
+ *
+ *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
+ *	1614 Oxford Street		mckusick@mckusick.com
+ *	Berkeley, CA 94709-1608		+1-510-843-9542
+ *	USA
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
+ *
+ * from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp
+ */
+
+#include
+__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.118 2011/10/07 09:35:06 hannken Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_ffs.h"
+#include "opt_quota.h"
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+struct snap_info {
+	kmutex_t si_lock;		/* Lock this snapinfo */
+	kmutex_t si_snaplock;		/* Snapshot vnode common lock */
+	lwp_t *si_owner;		/* Snaplock owner */
+	TAILQ_HEAD(inodelst, inode) si_snapshots; /* List of active snapshots */
+	daddr_t *si_snapblklist;	/* Snapshot block hints list */
+	uint32_t si_gen;		/* Incremented on change */
+};
+
+#if !defined(FFS_NO_SNAPSHOT)
+typedef int (*acctfunc_t)
+    (struct vnode *, void *, int, int, struct fs *, daddr_t, int);
+
+static int snapshot_setup(struct mount *, struct vnode *);
+static int snapshot_copyfs(struct mount *, struct vnode *, void **);
+static int snapshot_expunge(struct mount *, struct vnode *,
+    struct fs *, daddr_t *, daddr_t **);
+static int snapshot_expunge_snap(struct mount *, struct vnode *,
+    struct fs *, daddr_t);
+static int snapshot_writefs(struct mount *, struct vnode *, void *);
+static int cgaccount(struct vnode *, int, int *);
+static int cgaccount1(int, struct vnode *, void *, int);
+static int expunge(struct vnode *, struct inode *, struct fs *,
+    acctfunc_t, int);
+static int indiracct(struct vnode *, struct vnode *, int, daddr_t,
+    daddr_t, daddr_t, daddr_t, daddr_t, struct fs *,
acctfunc_t, int); +static int fullacct(struct vnode *, void *, int, int, struct fs *, + daddr_t, int); +static int snapacct(struct vnode *, void *, int, int, struct fs *, + daddr_t, int); +static int mapacct(struct vnode *, void *, int, int, struct fs *, + daddr_t, int); +#endif /* !defined(FFS_NO_SNAPSHOT) */ + +static int ffs_copyonwrite(void *, struct buf *, bool); +static int snapblkaddr(struct vnode *, daddr_t, daddr_t *); +static int rwfsblk(struct vnode *, int, void *, daddr_t); +static int syncsnap(struct vnode *); +static int wrsnapblk(struct vnode *, void *, daddr_t); +#if !defined(FFS_NO_SNAPSHOT) +static int blocks_in_journal(struct fs *); +#endif + +static inline bool is_active_snapshot(struct snap_info *, struct inode *); +static inline daddr_t db_get(struct inode *, int); +static inline void db_assign(struct inode *, int, daddr_t); +static inline daddr_t ib_get(struct inode *, int); +static inline void ib_assign(struct inode *, int, daddr_t); +static inline daddr_t idb_get(struct inode *, void *, int); +static inline void idb_assign(struct inode *, void *, int, daddr_t); + +#ifdef DEBUG +static int snapdebug = 0; +#endif + +int +ffs_snapshot_init(struct ufsmount *ump) +{ + struct snap_info *si; + + si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP); + if (si == NULL) + return ENOMEM; + + TAILQ_INIT(&si->si_snapshots); + mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE); + mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE); + si->si_owner = NULL; + si->si_gen = 0; + si->si_snapblklist = NULL; + + return 0; +} + +void +ffs_snapshot_fini(struct ufsmount *ump) +{ + struct snap_info *si; + + si = ump->um_snapinfo; + ump->um_snapinfo = NULL; + + KASSERT(TAILQ_EMPTY(&si->si_snapshots)); + mutex_destroy(&si->si_lock); + mutex_destroy(&si->si_snaplock); + KASSERT(si->si_snapblklist == NULL); + kmem_free(si, sizeof(*si)); +} + +/* + * Create a snapshot file and initialize it for the filesystem. + * Vnode is locked on entry and return. + */ +int +ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime) +{ +#if defined(FFS_NO_SNAPSHOT) + return EOPNOTSUPP; +} +#else /* defined(FFS_NO_SNAPSHOT) */ + bool suspended = false; + int error, redo = 0, snaploc; + void *sbbuf = NULL; + daddr_t *snaplist = NULL, snaplistsize = 0; + struct buf *bp, *nbp; + struct fs *copy_fs = NULL; + struct fs *fs = VFSTOUFS(mp)->um_fs; + struct inode *ip = VTOI(vp); + struct lwp *l = curlwp; + struct snap_info *si = VFSTOUFS(mp)->um_snapinfo; + struct timespec ts; + struct timeval starttime; +#ifdef DEBUG + struct timeval endtime; +#endif + struct vnode *devvp = ip->i_devvp; + + /* + * If the vnode already is a snapshot, return. + */ + if ((VTOI(vp)->i_flags & SF_SNAPSHOT)) { + if ((VTOI(vp)->i_flags & SF_SNAPINVAL)) + return EINVAL; + if (ctime) { + ctime->tv_sec = DIP(VTOI(vp), mtime); + ctime->tv_nsec = DIP(VTOI(vp), mtimensec); + } + return 0; + } + /* + * Check for free snapshot slot in the superblock. + */ + for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) + if (fs->fs_snapinum[snaploc] == 0) + break; + if (snaploc == FSMAXSNAP) + return (ENOSPC); + /* + * Prepare the vnode to become a snapshot. + */ + error = snapshot_setup(mp, vp); + if (error) + goto out; + + /* + * Copy all the cylinder group maps. Although the + * filesystem is still active, we hope that only a few + * cylinder groups will change between now and when we + * suspend operations. Thus, we will be able to quickly + * touch up the few cylinder groups that changed during + * the suspension period. 
+ */ + error = cgaccount(vp, 1, NULL); + if (error) + goto out; + + /* + * snapshot is now valid + */ + ip->i_flags &= ~SF_SNAPINVAL; + DIP_ASSIGN(ip, flags, ip->i_flags); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + + /* + * Ensure that the snapshot is completely on disk. + * Since we have marked it as a snapshot it is safe to + * unlock it as no process will be allowed to write to it. + */ + error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0); + if (error) + goto out; + VOP_UNLOCK(vp); + /* + * All allocations are done, so we can now suspend the filesystem. + */ + error = vfs_suspend(vp->v_mount, 0); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + if (error) + goto out; + suspended = true; + getmicrotime(&starttime); + /* + * First, copy all the cylinder group maps that have changed. + */ + error = cgaccount(vp, 2, &redo); + if (error) + goto out; + /* + * Create a copy of the superblock and its summary information. + */ + error = snapshot_copyfs(mp, vp, &sbbuf); + copy_fs = (struct fs *)((char *)sbbuf + blkoff(fs, fs->fs_sblockloc)); + if (error) + goto out; + /* + * Expunge unlinked files from our view. + */ + error = snapshot_expunge(mp, vp, copy_fs, &snaplistsize, &snaplist); + if (error) + goto out; + /* + * Record snapshot inode. Since this is the newest snapshot, + * it must be placed at the end of the list. + */ + if (ip->i_nlink > 0) + fs->fs_snapinum[snaploc] = ip->i_number; + + mutex_enter(&si->si_lock); + if (is_active_snapshot(si, ip)) + panic("ffs_snapshot: %"PRIu64" already on list", ip->i_number); + TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap); + if (TAILQ_FIRST(&si->si_snapshots) == ip) { + /* + * If this is the first snapshot on this filesystem, put the + * preliminary list in place and establish the cow handler. + */ + si->si_snapblklist = snaplist; + fscow_establish(mp, ffs_copyonwrite, devvp); + } + si->si_gen++; + mutex_exit(&si->si_lock); + + vp->v_vflag |= VV_SYSTEM; + /* + * Set the mtime to the time the snapshot has been taken. + */ + TIMEVAL_TO_TIMESPEC(&starttime, &ts); + if (ctime) + *ctime = ts; + DIP_ASSIGN(ip, mtime, ts.tv_sec); + DIP_ASSIGN(ip, mtimensec, ts.tv_nsec); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + /* + * Copy allocation information from all snapshots and then + * expunge them from our view. + */ + error = snapshot_expunge_snap(mp, vp, copy_fs, snaplistsize); + if (error) + goto out; + /* + * Write the superblock and its summary information to the snapshot. + */ + error = snapshot_writefs(mp, vp, sbbuf); + if (error) + goto out; + /* + * We're nearly done, ensure that the snapshot is completely on disk. + */ + error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0); + if (error) + goto out; + /* + * Invalidate and free all pages on the snapshot vnode. + * We will read and write through the buffercache. + */ + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, 0, 0, + PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO | PGO_FREE); + if (error) + goto out; + /* + * Invalidate short ( < fs_bsize ) buffers. We will always read + * full size buffers later. 
+ */
+	mutex_enter(&bufcache_lock);
+	KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
+	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
+		nbp = LIST_NEXT(bp, b_vnbufs);
+		KASSERT((bp->b_cflags & BC_BUSY) == 0);
+		if (bp->b_bcount < fs->fs_bsize) {
+			bp->b_cflags |= BC_BUSY;
+			brelsel(bp, BC_INVAL | BC_VFLUSH);
+		}
+	}
+	mutex_exit(&bufcache_lock);
+
+out:
+	if (sbbuf != NULL) {
+		free(copy_fs->fs_csp, M_UFSMNT);
+		free(sbbuf, M_UFSMNT);
+	}
+	if (fs->fs_active != NULL) {
+		free(fs->fs_active, M_DEVBUF);
+		fs->fs_active = NULL;
+	}
+
+	mutex_enter(&si->si_lock);
+	if (snaplist != NULL) {
+		if (si->si_snapblklist == snaplist)
+			si->si_snapblklist = NULL;
+		free(snaplist, M_UFSMNT);
+	}
+	if (error) {
+		fs->fs_snapinum[snaploc] = 0;
+	} else {
+		/*
+		 * As this is the newest list, it is the most inclusive, so
+		 * should replace the previous list.
+		 */
+		si->si_snapblklist = ip->i_snapblklist;
+	}
+	si->si_gen++;
+	mutex_exit(&si->si_lock);
+
+	if (suspended) {
+		VOP_UNLOCK(vp);
+		vfs_resume(vp->v_mount);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+#ifdef DEBUG
+		getmicrotime(&endtime);
+		timersub(&endtime, &starttime, &endtime);
+		printf("%s: suspended %lld.%03d sec, redo %d of %d\n",
+		    mp->mnt_stat.f_mntonname, (long long)endtime.tv_sec,
+		    endtime.tv_usec / 1000, redo, fs->fs_ncg);
+#endif
+	}
+	if (error) {
+		if (!UFS_WAPBL_BEGIN(mp)) {
+			(void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
+			UFS_WAPBL_END(mp);
+		}
+	} else if (ip->i_nlink > 0)
+		vref(vp);
+	return (error);
+}
+
+/*
+ * Prepare vnode to become a snapshot.
+ */
+static int
+snapshot_setup(struct mount *mp, struct vnode *vp)
+{
+	int error, n, len, loc, cg;
+	daddr_t blkno, numblks;
+	struct buf *ibp, *nbp;
+	struct fs *fs = VFSTOUFS(mp)->um_fs;
+	struct lwp *l = curlwp;
+	const int wbreak = blocks_in_journal(fs)/8;
+	struct inode *ip = VTOI(vp);
+
+	/*
+	 * Check mount, exclusive reference and owner.
+	 */
+	if (vp->v_mount != mp)
+		return EXDEV;
+	if (vp->v_usecount != 1 || vp->v_writecount != 0)
+		return EBUSY;
+	if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
+	    NULL) != 0 &&
+	    VTOI(vp)->i_uid != kauth_cred_geteuid(l->l_cred))
+		return EACCES;
+
+	if (vp->v_size != 0) {
+		error = ffs_truncate(vp, 0, 0, NOCRED);
+		if (error)
+			return error;
+	}
+
+	/* Change inode to snapshot type file. */
+	error = UFS_WAPBL_BEGIN(mp);
+	if (error)
+		return error;
+#if defined(QUOTA) || defined(QUOTA2)
+	/* snapshot inodes are not accounted in quotas */
+	chkiq(ip, -1, l->l_cred, 0);
+#endif
+	ip->i_flags |= (SF_SNAPSHOT | SF_SNAPINVAL);
+	DIP_ASSIGN(ip, flags, ip->i_flags);
+	ip->i_flag |= IN_CHANGE | IN_UPDATE;
+	ffs_update(vp, NULL, NULL, UPDATE_WAIT);
+	UFS_WAPBL_END(mp);
+
+	KASSERT(ip->i_flags & SF_SNAPSHOT);
+	/*
+	 * Write an empty list of preallocated blocks to the end of
+	 * the snapshot to set size to at least that of the filesystem.
+	 */
+	numblks = howmany(fs->fs_size, fs->fs_frag);
+	blkno = 1;
+	blkno = ufs_rw64(blkno, UFS_FSNEEDSWAP(fs));
+	error = vn_rdwr(UIO_WRITE, vp,
+	    (void *)&blkno, sizeof(blkno), lblktosize(fs, (off_t)numblks),
+	    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL);
+	if (error)
+		return error;
+	/*
+	 * Preallocate critical data structures so that we can copy
+	 * them in without further allocation after we suspend all
+	 * operations on the filesystem. We would like to just release
+	 * the allocated buffers without writing them since they will
+	 * be filled in below once we are ready to go, but this upsets
+	 * the soft update code, so we go ahead and write the new buffers.
+ * + * Allocate all indirect blocks and mark all of them as not + * needing to be copied. + */ + error = UFS_WAPBL_BEGIN(mp); + if (error) + return error; + for (blkno = NDADDR, n = 0; blkno < numblks; blkno += NINDIR(fs)) { + error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno), + fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); + if (error) + goto out; + brelse(ibp, 0); + if (wbreak > 0 && (++n % wbreak) == 0) { + UFS_WAPBL_END(mp); + error = UFS_WAPBL_BEGIN(mp); + if (error) + return error; + } + } + /* + * Allocate copies for the superblock and its summary information. + */ + error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, l->l_cred, + 0, &nbp); + if (error) + goto out; + bawrite(nbp); + blkno = fragstoblks(fs, fs->fs_csaddr); + len = howmany(fs->fs_cssize, fs->fs_bsize); + for (loc = 0; loc < len; loc++) { + error = ffs_balloc(vp, lblktosize(fs, (off_t)(blkno + loc)), + fs->fs_bsize, l->l_cred, 0, &nbp); + if (error) + goto out; + bawrite(nbp); + if (wbreak > 0 && (++n % wbreak) == 0) { + UFS_WAPBL_END(mp); + error = UFS_WAPBL_BEGIN(mp); + if (error) + return error; + } + } + /* + * Allocate all cylinder group blocks. + */ + for (cg = 0; cg < fs->fs_ncg; cg++) { + error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)), + fs->fs_bsize, l->l_cred, 0, &nbp); + if (error) + goto out; + bawrite(nbp); + if (wbreak > 0 && (++n % wbreak) == 0) { + UFS_WAPBL_END(mp); + error = UFS_WAPBL_BEGIN(mp); + if (error) + return error; + } + } + +out: + UFS_WAPBL_END(mp); + return error; +} + +/* + * Create a copy of the superblock and its summary information. + * It is up to the caller to free copyfs and copy_fs->fs_csp. + */ +static int +snapshot_copyfs(struct mount *mp, struct vnode *vp, void **sbbuf) +{ + int error, i, len, loc, size; + void *space; + int32_t *lp; + struct buf *bp; + struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs; + struct lwp *l = curlwp; + struct vnode *devvp = VTOI(vp)->i_devvp; + + /* + * Grab a copy of the superblock and its summary information. + * We delay writing it until the suspension is released below. + */ + *sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); + loc = blkoff(fs, fs->fs_sblockloc); + if (loc > 0) + memset(*sbbuf, 0, loc); + copyfs = (struct fs *)((char *)(*sbbuf) + loc); + memcpy(copyfs, fs, fs->fs_sbsize); + size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; + if (fs->fs_sbsize < size) + memset((char *)(*sbbuf) + loc + fs->fs_sbsize, 0, + size - fs->fs_sbsize); + size = blkroundup(fs, fs->fs_cssize); + if (fs->fs_contigsumsize > 0) + size += fs->fs_ncg * sizeof(int32_t); + space = malloc(size, M_UFSMNT, M_WAITOK); + copyfs->fs_csp = space; + memcpy(copyfs->fs_csp, fs->fs_csp, fs->fs_cssize); + space = (char *)space + fs->fs_cssize; + loc = howmany(fs->fs_cssize, fs->fs_fsize); + i = fs->fs_frag - loc % fs->fs_frag; + len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize; + if (len > 0) { + if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), + len, l->l_cred, 0, &bp)) != 0) { + brelse(bp, 0); + free(copyfs->fs_csp, M_UFSMNT); + free(*sbbuf, M_UFSMNT); + *sbbuf = NULL; + return error; + } + memcpy(space, bp->b_data, (u_int)len); + space = (char *)space + len; + brelse(bp, BC_INVAL | BC_NOCACHE); + } + if (fs->fs_contigsumsize > 0) { + copyfs->fs_maxcluster = lp = space; + for (i = 0; i < fs->fs_ncg; i++) + *lp++ = fs->fs_contigsumsize; + } + if (mp->mnt_wapbl) + copyfs->fs_flags &= ~FS_DOWAPBL; + return 0; +} + +/* + * We must check for active files that have been unlinked (e.g., with a zero + * link count). 
We have to expunge all trace of these files from the snapshot + * so that they are not reclaimed prematurely by fsck or unnecessarily dumped. + * Note that we skip unlinked snapshot files as they will be handled separately. + * Calculate the snapshot list size and create a preliminary list. + */ +static int +snapshot_expunge(struct mount *mp, struct vnode *vp, struct fs *copy_fs, + daddr_t *snaplistsize, daddr_t **snaplist) +{ + int cg, error = 0, len, loc; + daddr_t blkno, *blkp; + struct fs *fs = VFSTOUFS(mp)->um_fs; + struct inode *xp; + struct lwp *l = curlwp; + struct vattr vat; + struct vnode *logvp = NULL, *mvp = NULL, *xvp; + + *snaplist = NULL; + /* + * Get the log inode if any. + */ + if ((fs->fs_flags & FS_DOWAPBL) && + fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) { + error = VFS_VGET(mp, + fs->fs_journallocs[UFS_WAPBL_INFS_INO], &logvp); + if (error) + goto out; + } + /* + * Allocate a marker vnode. + */ + mvp = vnalloc(mp); + /* + * We also calculate the needed size for the snapshot list. + */ + *snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + + FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; + mutex_enter(&mntvnode_lock); + /* + * NOTE: not using the TAILQ_FOREACH here since in this loop vgone() + * and vclean() can be called indirectly + */ + for (xvp = TAILQ_FIRST(&mp->mnt_vnodelist); xvp; xvp = vunmark(mvp)) { + vmark(mvp, xvp); + /* + * Make sure this vnode wasn't reclaimed in getnewvnode(). + * Start over if it has (it won't be on the list anymore). + */ + if (xvp->v_mount != mp || vismarker(xvp)) + continue; + mutex_enter(xvp->v_interlock); + if ((xvp->v_iflag & VI_XLOCK) || + xvp->v_usecount == 0 || xvp->v_type == VNON || + VTOI(xvp) == NULL || + (VTOI(xvp)->i_flags & SF_SNAPSHOT)) { + mutex_exit(xvp->v_interlock); + continue; + } + mutex_exit(&mntvnode_lock); + /* + * XXXAD should increase vnode ref count to prevent it + * disappearing or being recycled. + */ + mutex_exit(xvp->v_interlock); +#ifdef DEBUG + if (snapdebug) + vprint("ffs_snapshot: busy vnode", xvp); +#endif + xp = VTOI(xvp); + if (xvp != logvp) { + if (VOP_GETATTR(xvp, &vat, l->l_cred) == 0 && + vat.va_nlink > 0) { + mutex_enter(&mntvnode_lock); + continue; + } + if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { + mutex_enter(&mntvnode_lock); + continue; + } + } + /* + * If there is a fragment, clear it here. + */ + blkno = 0; + loc = howmany(xp->i_size, fs->fs_bsize) - 1; + if (loc < NDADDR) { + len = fragroundup(fs, blkoff(fs, xp->i_size)); + if (len > 0 && len < fs->fs_bsize) { + error = UFS_WAPBL_BEGIN(mp); + if (error) { + (void)vunmark(mvp); + goto out; + } + ffs_blkfree_snap(copy_fs, vp, db_get(xp, loc), + len, xp->i_number); + blkno = db_get(xp, loc); + db_assign(xp, loc, 0); + UFS_WAPBL_END(mp); + } + } + *snaplistsize += 1; + error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY); + if (blkno) + db_assign(xp, loc, blkno); + if (!error) { + error = UFS_WAPBL_BEGIN(mp); + if (!error) { + error = ffs_freefile_snap(copy_fs, vp, + xp->i_number, xp->i_mode); + UFS_WAPBL_END(mp); + } + } + if (error) { + (void)vunmark(mvp); + goto out; + } + mutex_enter(&mntvnode_lock); + } + mutex_exit(&mntvnode_lock); + /* + * Create a preliminary list of preallocated snapshot blocks. 
+ */ + *snaplist = malloc(*snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); + blkp = &(*snaplist)[1]; + *blkp++ = lblkno(fs, fs->fs_sblockloc); + blkno = fragstoblks(fs, fs->fs_csaddr); + for (cg = 0; cg < fs->fs_ncg; cg++) { + if (fragstoblks(fs, cgtod(fs, cg)) > blkno) + break; + *blkp++ = fragstoblks(fs, cgtod(fs, cg)); + } + len = howmany(fs->fs_cssize, fs->fs_bsize); + for (loc = 0; loc < len; loc++) + *blkp++ = blkno + loc; + for (; cg < fs->fs_ncg; cg++) + *blkp++ = fragstoblks(fs, cgtod(fs, cg)); + (*snaplist)[0] = blkp - &(*snaplist)[0]; + +out: + if (mvp != NULL) + vnfree(mvp); + if (logvp != NULL) + vput(logvp); + if (error && *snaplist != NULL) { + free(*snaplist, M_UFSMNT); + *snaplist = NULL; + } + + return error; +} + +/* + * Copy allocation information from all the snapshots in this snapshot and + * then expunge them from its view. Also, collect the list of allocated + * blocks in i_snapblklist. + */ +static int +snapshot_expunge_snap(struct mount *mp, struct vnode *vp, + struct fs *copy_fs, daddr_t snaplistsize) +{ + int error = 0, i; + daddr_t numblks, *snaplist = NULL; + struct fs *fs = VFSTOUFS(mp)->um_fs; + struct inode *ip = VTOI(vp), *xp; + struct lwp *l = curlwp; + struct snap_info *si = VFSTOUFS(mp)->um_snapinfo; + + TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) { + if (xp != ip) { + error = expunge(vp, xp, fs, snapacct, BLK_SNAP); + if (error) + break; + } + if (xp->i_nlink != 0) + continue; + error = UFS_WAPBL_BEGIN(mp); + if (error) + break; + error = ffs_freefile_snap(copy_fs, vp, xp->i_number, xp->i_mode); + UFS_WAPBL_END(mp); + if (error) + break; + } + if (error) + goto out; + /* + * Allocate space for the full list of preallocated snapshot blocks. + */ + snaplist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); + ip->i_snapblklist = &snaplist[1]; + /* + * Expunge the blocks used by the snapshots from the set of + * blocks marked as used in the snapshot bitmaps. Also, collect + * the list of allocated blocks in i_snapblklist. + */ + error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP); + if (error) + goto out; + if (snaplistsize < ip->i_snapblklist - snaplist) + panic("ffs_snapshot: list too small"); + snaplistsize = ip->i_snapblklist - snaplist; + snaplist[0] = snaplistsize; + ip->i_snapblklist = &snaplist[0]; + /* + * Write out the list of allocated blocks to the end of the snapshot. + */ + numblks = howmany(fs->fs_size, fs->fs_frag); + for (i = 0; i < snaplistsize; i++) + snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs)); + error = vn_rdwr(UIO_WRITE, vp, (void *)snaplist, + snaplistsize * sizeof(daddr_t), lblktosize(fs, (off_t)numblks), + UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT, l->l_cred, NULL, NULL); + for (i = 0; i < snaplistsize; i++) + snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs)); +out: + if (error && snaplist != NULL) { + free(snaplist, M_UFSMNT); + ip->i_snapblklist = NULL; + } + return error; +} + +/* + * Write the superblock and its summary information to the snapshot. + * Make sure, the first NDADDR blocks get copied to the snapshot. + */ +static int +snapshot_writefs(struct mount *mp, struct vnode *vp, void *sbbuf) +{ + int error, len, loc; + void *space; + daddr_t blkno; + struct buf *bp; + struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs; + struct inode *ip = VTOI(vp); + struct lwp *l = curlwp; + + copyfs = (struct fs *)((char *)sbbuf + blkoff(fs, fs->fs_sblockloc)); + + /* + * Write the superblock and its summary information + * to the snapshot. 
+ */ + blkno = fragstoblks(fs, fs->fs_csaddr); + len = howmany(fs->fs_cssize, fs->fs_bsize); + space = copyfs->fs_csp; +#ifdef FFS_EI + if (UFS_FSNEEDSWAP(fs)) { + ffs_sb_swap(copyfs, copyfs); + ffs_csum_swap(space, space, fs->fs_cssize); + } +#endif + error = UFS_WAPBL_BEGIN(mp); + if (error) + return error; + for (loc = 0; loc < len; loc++) { + error = bread(vp, blkno + loc, fs->fs_bsize, l->l_cred, + B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + break; + } + memcpy(bp->b_data, space, fs->fs_bsize); + space = (char *)space + fs->fs_bsize; + bawrite(bp); + } + if (error) + goto out; + error = bread(vp, lblkno(fs, fs->fs_sblockloc), + fs->fs_bsize, l->l_cred, B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + goto out; + } else { + memcpy(bp->b_data, sbbuf, fs->fs_bsize); + bawrite(bp); + } + /* + * Copy the first NDADDR blocks to the snapshot so ffs_copyonwrite() + * and ffs_snapblkfree() will always work on indirect blocks. + */ + for (loc = 0; loc < NDADDR; loc++) { + if (db_get(ip, loc) != 0) + continue; + error = ffs_balloc(vp, lblktosize(fs, (off_t)loc), + fs->fs_bsize, l->l_cred, 0, &bp); + if (error) + break; + error = rwfsblk(vp, B_READ, bp->b_data, loc); + if (error) { + brelse(bp, 0); + break; + } + bawrite(bp); + } + +out: + UFS_WAPBL_END(mp); + return error; +} + +/* + * Copy all cylinder group maps. + */ +static int +cgaccount(struct vnode *vp, int passno, int *redo) +{ + int cg, error = 0; + struct buf *nbp; + struct fs *fs = VTOI(vp)->i_fs; + + if (redo != NULL) + *redo = 0; + if (passno == 1) + fs->fs_active = malloc(howmany(fs->fs_ncg, NBBY), + M_DEVBUF, M_WAITOK | M_ZERO); + for (cg = 0; cg < fs->fs_ncg; cg++) { + if (passno == 2 && ACTIVECG_ISSET(fs, cg)) + continue; + + if (redo != NULL) + *redo += 1; + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) + return error; + error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)), + fs->fs_bsize, curlwp->l_cred, 0, &nbp); + if (error) { + UFS_WAPBL_END(vp->v_mount); + break; + } + error = cgaccount1(cg, vp, nbp->b_data, passno); + bawrite(nbp); + UFS_WAPBL_END(vp->v_mount); + if (error) + break; + } + return error; +} + +/* + * Copy a cylinder group map. All the unallocated blocks are marked + * BLK_NOCOPY so that the snapshot knows that it need not copy them + * if they are later written. If passno is one, then this is a first + * pass, so only setting needs to be done. If passno is 2, then this + * is a revision to a previous pass which must be undone as the + * replacement pass is done. 
+ */ +static int +cgaccount1(int cg, struct vnode *vp, void *data, int passno) +{ + struct buf *bp, *ibp; + struct inode *ip; + struct cg *cgp; + struct fs *fs; + struct lwp *l = curlwp; + daddr_t base, numblks; + int error, len, loc, ns, indiroff; + + ip = VTOI(vp); + fs = ip->i_fs; + ns = UFS_FSNEEDSWAP(fs); + error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, l->l_cred, 0, &bp); + if (error) { + brelse(bp, 0); + return (error); + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp, ns)) { + brelse(bp, 0); + return (EIO); + } + ACTIVECG_SET(fs, cg); + + memcpy(data, bp->b_data, fs->fs_cgsize); + brelse(bp, 0); + if (fs->fs_cgsize < fs->fs_bsize) + memset((char *)data + fs->fs_cgsize, 0, + fs->fs_bsize - fs->fs_cgsize); + numblks = howmany(fs->fs_size, fs->fs_frag); + len = howmany(fs->fs_fpg, fs->fs_frag); + base = cg * fs->fs_fpg / fs->fs_frag; + if (base + len >= numblks) + len = numblks - base - 1; + loc = 0; + if (base < NDADDR) { + for ( ; loc < NDADDR; loc++) { + if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) + db_assign(ip, loc, BLK_NOCOPY); + else if (db_get(ip, loc) == BLK_NOCOPY) { + if (passno == 2) + db_assign(ip, loc, 0); + else if (passno == 1) + panic("ffs_snapshot: lost direct block"); + } + } + } + if ((error = ffs_balloc(vp, lblktosize(fs, (off_t)(base + loc)), + fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0) + return (error); + indiroff = (base + loc - NDADDR) % NINDIR(fs); + for ( ; loc < len; loc++, indiroff++) { + if (indiroff >= NINDIR(fs)) { + bawrite(ibp); + if ((error = ffs_balloc(vp, + lblktosize(fs, (off_t)(base + loc)), + fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0) + return (error); + indiroff = 0; + } + if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) + idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY); + else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) { + if (passno == 2) + idb_assign(ip, ibp->b_data, indiroff, 0); + else if (passno == 1) + panic("ffs_snapshot: lost indirect block"); + } + } + bdwrite(ibp); + return (0); +} + +/* + * Before expunging a snapshot inode, note all the + * blocks that it claims with BLK_SNAP so that fsck will + * be able to account for those blocks properly and so + * that this snapshot knows that it need not copy them + * if the other snapshot holding them is freed. + */ +static int +expunge(struct vnode *snapvp, struct inode *cancelip, struct fs *fs, + acctfunc_t acctfunc, int expungetype) +{ + int i, error, ns; + daddr_t lbn, rlbn; + daddr_t len, blkno, numblks, blksperindir; + struct ufs1_dinode *dip1; + struct ufs2_dinode *dip2; + struct lwp *l = curlwp; + void *bap; + struct buf *bp; + struct mount *mp; + + ns = UFS_FSNEEDSWAP(fs); + mp = snapvp->v_mount; + + error = UFS_WAPBL_BEGIN(mp); + if (error) + return error; + /* + * Prepare to expunge the inode. If its inode block has not + * yet been copied, then allocate and fill the copy. + */ + lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); + error = snapblkaddr(snapvp, lbn, &blkno); + if (error) + return error; + if (blkno != 0) { + error = bread(snapvp, lbn, fs->fs_bsize, l->l_cred, + B_MODIFY, &bp); + } else { + error = ffs_balloc(snapvp, lblktosize(fs, (off_t)lbn), + fs->fs_bsize, l->l_cred, 0, &bp); + if (! error) + error = rwfsblk(snapvp, B_READ, bp->b_data, lbn); + } + if (error) { + UFS_WAPBL_END(mp); + return error; + } + /* + * Set a snapshot inode to be a zero length file, regular files + * or unlinked snapshots to be completely unallocated. 
+ */ + if (fs->fs_magic == FS_UFS1_MAGIC) { + dip1 = (struct ufs1_dinode *)bp->b_data + + ino_to_fsbo(fs, cancelip->i_number); + if (cancelip->i_flags & SF_SNAPSHOT) { + dip1->di_flags = + ufs_rw32(ufs_rw32(dip1->di_flags, ns) | + SF_SNAPINVAL, ns); + } + if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0) + dip1->di_mode = 0; + dip1->di_size = 0; + dip1->di_blocks = 0; + memset(&dip1->di_db[0], 0, (NDADDR + NIADDR) * sizeof(int32_t)); + } else { + dip2 = (struct ufs2_dinode *)bp->b_data + + ino_to_fsbo(fs, cancelip->i_number); + if (cancelip->i_flags & SF_SNAPSHOT) { + dip2->di_flags = + ufs_rw32(ufs_rw32(dip2->di_flags, ns) | + SF_SNAPINVAL, ns); + } + if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0) + dip2->di_mode = 0; + dip2->di_size = 0; + dip2->di_blocks = 0; + memset(&dip2->di_db[0], 0, (NDADDR + NIADDR) * sizeof(int64_t)); + } + bdwrite(bp); + UFS_WAPBL_END(mp); + /* + * Now go through and expunge all the blocks in the file + * using the function requested. + */ + numblks = howmany(cancelip->i_size, fs->fs_bsize); + if (fs->fs_magic == FS_UFS1_MAGIC) + bap = &cancelip->i_ffs1_db[0]; + else + bap = &cancelip->i_ffs2_db[0]; + error = (*acctfunc)(snapvp, bap, 0, NDADDR, fs, 0, expungetype); + if (error) + return (error); + if (fs->fs_magic == FS_UFS1_MAGIC) + bap = &cancelip->i_ffs1_ib[0]; + else + bap = &cancelip->i_ffs2_ib[0]; + error = (*acctfunc)(snapvp, bap, 0, NIADDR, fs, -1, expungetype); + if (error) + return (error); + blksperindir = 1; + lbn = -NDADDR; + len = numblks - NDADDR; + rlbn = NDADDR; + for (i = 0; len > 0 && i < NIADDR; i++) { + error = indiracct(snapvp, ITOV(cancelip), i, + ib_get(cancelip, i), lbn, rlbn, len, + blksperindir, fs, acctfunc, expungetype); + if (error) + return (error); + blksperindir *= NINDIR(fs); + lbn -= blksperindir + 1; + len -= blksperindir; + rlbn += blksperindir; + } + return (0); +} + +/* + * Descend an indirect block chain for vnode cancelvp accounting for all + * its indirect blocks in snapvp. + */ +static int +indiracct(struct vnode *snapvp, struct vnode *cancelvp, int level, + daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks, + daddr_t blksperindir, struct fs *fs, acctfunc_t acctfunc, int expungetype) +{ + int error, num, i; + daddr_t subblksperindir; + struct indir indirs[NIADDR + 2]; + daddr_t last; + void *bap; + struct buf *bp; + + if (blkno == 0) { + if (expungetype == BLK_NOCOPY) + return (0); + panic("indiracct: missing indir"); + } + if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) + return (error); + if (lbn != indirs[num - 1 - level].in_lbn || num < 2) + panic("indiracct: botched params"); + /* + * We have to expand bread here since it will deadlock looking + * up the block number for any blocks that are not in the cache. + */ + error = ffs_getblk(cancelvp, lbn, fsbtodb(fs, blkno), fs->fs_bsize, + false, &bp); + if (error) + return error; + if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error = + rwfsblk(bp->b_vp, B_READ, bp->b_data, fragstoblks(fs, blkno)))) { + brelse(bp, 0); + return (error); + } + /* + * Account for the block pointers in this indirect block. + */ + last = howmany(remblks, blksperindir); + if (last > NINDIR(fs)) + last = NINDIR(fs); + bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK | M_ZERO); + memcpy((void *)bap, bp->b_data, fs->fs_bsize); + brelse(bp, 0); + error = (*acctfunc)(snapvp, bap, 0, last, + fs, level == 0 ? 
rlbn : -1, expungetype); + if (error || level == 0) + goto out; + /* + * Account for the block pointers in each of the indirect blocks + * in the levels below us. + */ + subblksperindir = blksperindir / NINDIR(fs); + for (lbn++, level--, i = 0; i < last; i++) { + error = indiracct(snapvp, cancelvp, level, + idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks, + subblksperindir, fs, acctfunc, expungetype); + if (error) + goto out; + rlbn += blksperindir; + lbn -= blksperindir; + remblks -= blksperindir; + } +out: + free(bap, M_DEVBUF); + return (error); +} + +/* + * Do both snap accounting and map accounting. + */ +static int +fullacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, + struct fs *fs, daddr_t lblkno, + int exptype /* BLK_SNAP or BLK_NOCOPY */) +{ + int error; + + if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype))) + return (error); + return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)); +} + +/* + * Identify a set of blocks allocated in a snapshot inode. + */ +static int +snapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, + struct fs *fs, daddr_t lblkno, + int expungetype /* BLK_SNAP or BLK_NOCOPY */) +{ + struct inode *ip = VTOI(vp); + struct lwp *l = curlwp; + struct mount *mp = vp->v_mount; + daddr_t blkno; + daddr_t lbn; + struct buf *ibp; + int error, n; + const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8; + + error = UFS_WAPBL_BEGIN(mp); + if (error) + return error; + for ( n = 0; oldblkp < lastblkp; oldblkp++) { + blkno = idb_get(ip, bap, oldblkp); + if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) + continue; + lbn = fragstoblks(fs, blkno); + if (lbn < NDADDR) { + blkno = db_get(ip, lbn); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } else { + error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), + fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); + if (error) + break; + blkno = idb_get(ip, ibp->b_data, + (lbn - NDADDR) % NINDIR(fs)); + } + /* + * If we are expunging a snapshot vnode and we + * find a block marked BLK_NOCOPY, then it is + * one that has been allocated to this snapshot after + * we took our current snapshot and can be ignored. + */ + if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) { + if (lbn >= NDADDR) + brelse(ibp, 0); + } else { + if (blkno != 0) + panic("snapacct: bad block"); + if (lbn < NDADDR) + db_assign(ip, lbn, expungetype); + else { + idb_assign(ip, ibp->b_data, + (lbn - NDADDR) % NINDIR(fs), expungetype); + bdwrite(ibp); + } + } + if (wbreak > 0 && (++n % wbreak) == 0) { + UFS_WAPBL_END(mp); + error = UFS_WAPBL_BEGIN(mp); + if (error) + return error; + } + } + UFS_WAPBL_END(mp); + return error; +} + +/* + * Account for a set of blocks allocated in a snapshot inode. 
+ */ +static int +mapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, + struct fs *fs, daddr_t lblkno, int expungetype) +{ + daddr_t blkno; + struct inode *ip; + struct mount *mp = vp->v_mount; + ino_t inum; + int acctit, error, n; + const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8; + + error = UFS_WAPBL_BEGIN(mp); + if (error) + return error; + ip = VTOI(vp); + inum = ip->i_number; + if (lblkno == -1) + acctit = 0; + else + acctit = 1; + for ( n = 0; oldblkp < lastblkp; oldblkp++, lblkno++) { + blkno = idb_get(ip, bap, oldblkp); + if (blkno == 0 || blkno == BLK_NOCOPY) + continue; + if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) + *ip->i_snapblklist++ = lblkno; + if (blkno == BLK_SNAP) + blkno = blkstofrags(fs, lblkno); + ffs_blkfree_snap(fs, vp, blkno, fs->fs_bsize, inum); + if (wbreak > 0 && (++n % wbreak) == 0) { + UFS_WAPBL_END(mp); + error = UFS_WAPBL_BEGIN(mp); + if (error) + return error; + } + } + UFS_WAPBL_END(mp); + return (0); +} + +/* + * Number of blocks that fit into the journal or zero if not logging. + */ +static int +blocks_in_journal(struct fs *fs) +{ + off_t bpj; + + if ((fs->fs_flags & FS_DOWAPBL) == 0) + return 0; + bpj = 1; + if (fs->fs_journal_version == UFS_WAPBL_VERSION) { + switch (fs->fs_journal_location) { + case UFS_WAPBL_JOURNALLOC_END_PARTITION: + bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]* + fs->fs_journallocs[UFS_WAPBL_EPART_COUNT]; + break; + case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM: + bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]* + fs->fs_journallocs[UFS_WAPBL_INFS_COUNT]; + break; + } + } + bpj /= fs->fs_bsize; + return (bpj > 0 ? bpj : 1); +} +#endif /* defined(FFS_NO_SNAPSHOT) */ + +/* + * Decrement extra reference on snapshot when last name is removed. + * It will not be freed until the last open reference goes away. + */ +void +ffs_snapgone(struct inode *ip) +{ + struct mount *mp = ip->i_devvp->v_specmountpoint; + struct inode *xp; + struct fs *fs; + struct snap_info *si; + int snaploc; + + si = VFSTOUFS(mp)->um_snapinfo; + + /* + * Find snapshot in incore list. + */ + mutex_enter(&si->si_lock); + TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) + if (xp == ip) + break; + mutex_exit(&si->si_lock); + if (xp != NULL) + vrele(ITOV(ip)); +#ifdef DEBUG + else if (snapdebug) + printf("ffs_snapgone: lost snapshot vnode %llu\n", + (unsigned long long)ip->i_number); +#endif + /* + * Delete snapshot inode from superblock. Keep list dense. + */ + mutex_enter(&si->si_lock); + fs = ip->i_fs; + for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) + if (fs->fs_snapinum[snaploc] == ip->i_number) + break; + if (snaploc < FSMAXSNAP) { + for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { + if (fs->fs_snapinum[snaploc] == 0) + break; + fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; + } + fs->fs_snapinum[snaploc - 1] = 0; + } + si->si_gen++; + mutex_exit(&si->si_lock); +} + +/* + * Prepare a snapshot file for being removed. + */ +void +ffs_snapremove(struct vnode *vp) +{ + struct inode *ip = VTOI(vp), *xp; + struct vnode *devvp = ip->i_devvp; + struct fs *fs = ip->i_fs; + struct mount *mp = devvp->v_specmountpoint; + struct buf *ibp; + struct snap_info *si; + struct lwp *l = curlwp; + daddr_t numblks, blkno, dblk; + int error, loc, last; + + si = VFSTOUFS(mp)->um_snapinfo; + /* + * If active, delete from incore list (this snapshot may + * already have been in the process of being deleted, so + * would not have been active). + * + * Clear copy-on-write flag if last snapshot. 
+ */ + mutex_enter(&si->si_snaplock); + mutex_enter(&si->si_lock); + if (is_active_snapshot(si, ip)) { + TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap); + if (TAILQ_FIRST(&si->si_snapshots) != 0) { + /* Roll back the list of preallocated blocks. */ + xp = TAILQ_LAST(&si->si_snapshots, inodelst); + si->si_snapblklist = xp->i_snapblklist; + si->si_gen++; + mutex_exit(&si->si_lock); + mutex_exit(&si->si_snaplock); + } else { + si->si_snapblklist = 0; + si->si_gen++; + mutex_exit(&si->si_lock); + mutex_exit(&si->si_snaplock); + fscow_disestablish(mp, ffs_copyonwrite, devvp); + } + if (ip->i_snapblklist != NULL) { + free(ip->i_snapblklist, M_UFSMNT); + ip->i_snapblklist = NULL; + } + } else { + mutex_exit(&si->si_lock); + mutex_exit(&si->si_snaplock); + } + /* + * Clear all BLK_NOCOPY fields. Pass any block claims to other + * snapshots that want them (see ffs_snapblkfree below). + */ + for (blkno = 1; blkno < NDADDR; blkno++) { + dblk = db_get(ip, blkno); + if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) + db_assign(ip, blkno, 0); + else if ((dblk == blkstofrags(fs, blkno) && + ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize, + ip->i_number))) { + DIP_ADD(ip, blocks, -btodb(fs->fs_bsize)); + db_assign(ip, blkno, 0); + } + } + numblks = howmany(ip->i_size, fs->fs_bsize); + for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { + error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno), + fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); + if (error) + continue; + if (fs->fs_size - blkno > NINDIR(fs)) + last = NINDIR(fs); + else + last = fs->fs_size - blkno; + for (loc = 0; loc < last; loc++) { + dblk = idb_get(ip, ibp->b_data, loc); + if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) + idb_assign(ip, ibp->b_data, loc, 0); + else if (dblk == blkstofrags(fs, blkno) && + ffs_snapblkfree(fs, ip->i_devvp, dblk, + fs->fs_bsize, ip->i_number)) { + DIP_ADD(ip, blocks, -btodb(fs->fs_bsize)); + idb_assign(ip, ibp->b_data, loc, 0); + } + } + bawrite(ibp); + UFS_WAPBL_END(mp); + error = UFS_WAPBL_BEGIN(mp); + KASSERT(error == 0); + } + /* + * Clear snapshot flag and drop reference. + */ + ip->i_flags &= ~(SF_SNAPSHOT | SF_SNAPINVAL); + DIP_ASSIGN(ip, flags, ip->i_flags); + ip->i_flag |= IN_CHANGE | IN_UPDATE; +#if defined(QUOTA) || defined(QUOTA2) + chkdq(ip, DIP(ip, blocks), l->l_cred, FORCE); + chkiq(ip, 1, l->l_cred, FORCE); +#endif +} + +/* + * Notification that a block is being freed. Return zero if the free + * should be allowed to proceed. Return non-zero if the snapshot file + * wants to claim the block. The block will be claimed if it is an + * uncopied part of one of the snapshots. It will be freed if it is + * either a BLK_NOCOPY or has already been copied in all of the snapshots. + * If a fragment is being freed, then all snapshots that care about + * it must make a copy since a snapshot file can only claim full sized + * blocks. Note that if more than one snapshot file maps the block, + * we can pick one at random to claim it. Since none of the snapshots + * can change, we are assurred that they will all see the same unmodified + * image. When deleting a snapshot file (see ffs_snapremove above), we + * must push any of these claimed blocks to one of the other snapshots + * that maps it. These claimed blocks are easily identified as they will + * have a block number equal to their logical block number within the + * snapshot. A copied block can never have this property because they + * must always have been allocated from a BLK_NOCOPY location. 
+ */ +int +ffs_snapblkfree(struct fs *fs, struct vnode *devvp, daddr_t bno, + long size, ino_t inum) +{ + struct mount *mp = devvp->v_specmountpoint; + struct buf *ibp; + struct inode *ip; + struct vnode *vp = NULL; + struct snap_info *si; + void *saved_data = NULL; + daddr_t lbn; + daddr_t blkno; + uint32_t gen; + int indiroff = 0, error = 0, claimedblk = 0; + + si = VFSTOUFS(mp)->um_snapinfo; + lbn = fragstoblks(fs, bno); + mutex_enter(&si->si_snaplock); + mutex_enter(&si->si_lock); + si->si_owner = curlwp; + +retry: + gen = si->si_gen; + TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) { + vp = ITOV(ip); + /* + * Lookup block being written. + */ + if (lbn < NDADDR) { + blkno = db_get(ip, lbn); + } else { + mutex_exit(&si->si_lock); + error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), + fs->fs_bsize, FSCRED, B_METAONLY, &ibp); + if (error) { + mutex_enter(&si->si_lock); + break; + } + indiroff = (lbn - NDADDR) % NINDIR(fs); + blkno = idb_get(ip, ibp->b_data, indiroff); + mutex_enter(&si->si_lock); + if (gen != si->si_gen) { + brelse(ibp, 0); + goto retry; + } + } + /* + * Check to see if block needs to be copied. + */ + if (blkno == 0) { + /* + * A block that we map is being freed. If it has not + * been claimed yet, we will claim or copy it (below). + */ + claimedblk = 1; + } else if (blkno == BLK_SNAP) { + /* + * No previous snapshot claimed the block, + * so it will be freed and become a BLK_NOCOPY + * (don't care) for us. + */ + if (claimedblk) + panic("snapblkfree: inconsistent block type"); + if (lbn < NDADDR) { + db_assign(ip, lbn, BLK_NOCOPY); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } else { + idb_assign(ip, ibp->b_data, indiroff, + BLK_NOCOPY); + mutex_exit(&si->si_lock); + if (ip->i_nlink > 0) + bwrite(ibp); + else + bdwrite(ibp); + mutex_enter(&si->si_lock); + if (gen != si->si_gen) + goto retry; + } + continue; + } else /* BLK_NOCOPY or default */ { + /* + * If the snapshot has already copied the block + * (default), or does not care about the block, + * it is not needed. + */ + if (lbn >= NDADDR) + brelse(ibp, 0); + continue; + } + /* + * If this is a full size block, we will just grab it + * and assign it to the snapshot inode. Otherwise we + * will proceed to copy it. See explanation for this + * routine as to why only a single snapshot needs to + * claim this block. + */ + if (size == fs->fs_bsize) { +#ifdef DEBUG + if (snapdebug) + printf("%s %llu lbn %" PRId64 + "from inum %llu\n", + "Grabonremove: snapino", + (unsigned long long)ip->i_number, + lbn, (unsigned long long)inum); +#endif + mutex_exit(&si->si_lock); + if (lbn < NDADDR) { + db_assign(ip, lbn, bno); + } else { + idb_assign(ip, ibp->b_data, indiroff, bno); + if (ip->i_nlink > 0) + bwrite(ibp); + else + bdwrite(ibp); + } + DIP_ADD(ip, blocks, btodb(size)); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (ip->i_nlink > 0 && mp->mnt_wapbl) + error = syncsnap(vp); + else + error = 0; + mutex_enter(&si->si_lock); + si->si_owner = NULL; + mutex_exit(&si->si_lock); + mutex_exit(&si->si_snaplock); + return (error == 0); + } + if (lbn >= NDADDR) + brelse(ibp, 0); +#ifdef DEBUG + if (snapdebug) + printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n", + "Copyonremove: snapino ", + (unsigned long long)ip->i_number, + lbn, "for inum", (unsigned long long)inum, size); +#endif + /* + * If we have already read the old block contents, then + * simply copy them to the new block. 
Note that we need + * to synchronously write snapshots that have not been + * unlinked, and hence will be visible after a crash, + * to ensure their integrity. + */ + mutex_exit(&si->si_lock); + if (saved_data == NULL) { + saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); + error = rwfsblk(vp, B_READ, saved_data, lbn); + if (error) { + free(saved_data, M_UFSMNT); + saved_data = NULL; + mutex_enter(&si->si_lock); + break; + } + } + error = wrsnapblk(vp, saved_data, lbn); + if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl) + error = syncsnap(vp); + mutex_enter(&si->si_lock); + if (error) + break; + if (gen != si->si_gen) + goto retry; + } + si->si_owner = NULL; + mutex_exit(&si->si_lock); + mutex_exit(&si->si_snaplock); + if (saved_data) + free(saved_data, M_UFSMNT); + /* + * If we have been unable to allocate a block in which to do + * the copy, then return non-zero so that the fragment will + * not be freed. Although space will be lost, the snapshot + * will stay consistent. + */ + return (error); +} + +/* + * Associate snapshot files when mounting. + */ +void +ffs_snapshot_mount(struct mount *mp) +{ + struct vnode *devvp = VFSTOUFS(mp)->um_devvp; + struct fs *fs = VFSTOUFS(mp)->um_fs; + struct lwp *l = curlwp; + struct vnode *vp; + struct inode *ip, *xp; + struct snap_info *si; + daddr_t snaplistsize, *snapblklist; + int i, error, ns, snaploc, loc; + + /* + * No persistent snapshots on apple ufs file systems. + */ + if (UFS_MPISAPPLEUFS(VFSTOUFS(mp))) + return; + + si = VFSTOUFS(mp)->um_snapinfo; + ns = UFS_FSNEEDSWAP(fs); + /* + * XXX The following needs to be set before ffs_truncate or + * VOP_READ can be called. + */ + mp->mnt_stat.f_iosize = fs->fs_bsize; + /* + * Process each snapshot listed in the superblock. + */ + vp = NULL; + mutex_enter(&si->si_lock); + for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { + if (fs->fs_snapinum[snaploc] == 0) + break; + if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], + &vp)) != 0) { + printf("ffs_snapshot_mount: vget failed %d\n", error); + continue; + } + ip = VTOI(vp); + if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) != + SF_SNAPSHOT) { + printf("ffs_snapshot_mount: non-snapshot inode %d\n", + fs->fs_snapinum[snaploc]); + vput(vp); + vp = NULL; + for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { + if (fs->fs_snapinum[loc] == 0) + break; + fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; + } + fs->fs_snapinum[loc - 1] = 0; + snaploc--; + continue; + } + + /* + * Read the block hints list. Use an empty list on + * read errors. + */ + error = vn_rdwr(UIO_READ, vp, + (void *)&snaplistsize, sizeof(snaplistsize), + lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)), + UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS, + l->l_cred, NULL, NULL); + if (error) { + printf("ffs_snapshot_mount: read_1 failed %d\n", error); + snaplistsize = 1; + } else + snaplistsize = ufs_rw64(snaplistsize, ns); + snapblklist = malloc( + snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); + if (error) + snapblklist[0] = 1; + else { + error = vn_rdwr(UIO_READ, vp, (void *)snapblklist, + snaplistsize * sizeof(daddr_t), + lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)), + UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS, + l->l_cred, NULL, NULL); + for (i = 0; i < snaplistsize; i++) + snapblklist[i] = ufs_rw64(snapblklist[i], ns); + if (error) { + printf("ffs_snapshot_mount: read_2 failed %d\n", + error); + snapblklist[0] = 1; + } + } + ip->i_snapblklist = &snapblklist[0]; + + /* + * Link it onto the active snapshot list. 
+ */ + if (is_active_snapshot(si, ip)) + panic("ffs_snapshot_mount: %"PRIu64" already on list", + ip->i_number); + else + TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap); + vp->v_vflag |= VV_SYSTEM; + VOP_UNLOCK(vp); + } + /* + * No usable snapshots found. + */ + if (vp == NULL) { + mutex_exit(&si->si_lock); + return; + } + /* + * Attach the block hints list. We always want to + * use the list from the newest snapshot. + */ + xp = TAILQ_LAST(&si->si_snapshots, inodelst); + si->si_snapblklist = xp->i_snapblklist; + fscow_establish(mp, ffs_copyonwrite, devvp); + si->si_gen++; + mutex_exit(&si->si_lock); +} + +/* + * Disassociate snapshot files when unmounting. + */ +void +ffs_snapshot_unmount(struct mount *mp) +{ + struct vnode *devvp = VFSTOUFS(mp)->um_devvp; + struct inode *xp; + struct vnode *vp = NULL; + struct snap_info *si; + + si = VFSTOUFS(mp)->um_snapinfo; + mutex_enter(&si->si_lock); + while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) { + vp = ITOV(xp); + TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap); + if (xp->i_snapblklist == si->si_snapblklist) + si->si_snapblklist = NULL; + free(xp->i_snapblklist, M_UFSMNT); + if (xp->i_nlink > 0) { + si->si_gen++; + mutex_exit(&si->si_lock); + vrele(vp); + mutex_enter(&si->si_lock); + } + } + si->si_gen++; + mutex_exit(&si->si_lock); + if (vp) + fscow_disestablish(mp, ffs_copyonwrite, devvp); +} + +/* + * Check for need to copy block that is about to be written, + * copying the block if necessary. + */ +static int +ffs_copyonwrite(void *v, struct buf *bp, bool data_valid) +{ + struct fs *fs; + struct inode *ip; + struct vnode *devvp = v, *vp = NULL; + struct mount *mp = devvp->v_specmountpoint; + struct snap_info *si; + void *saved_data = NULL; + daddr_t lbn, blkno, *snapblklist; + uint32_t gen; + int lower, upper, mid, snapshot_locked = 0, error = 0; + + /* + * Check for valid snapshots. + */ + si = VFSTOUFS(mp)->um_snapinfo; + mutex_enter(&si->si_lock); + ip = TAILQ_FIRST(&si->si_snapshots); + if (ip == NULL) { + mutex_exit(&si->si_lock); + return 0; + } + /* + * First check to see if it is after the file system, + * in the journal or in the preallocated list. + * By doing these checks we avoid several potential deadlocks. + */ + fs = ip->i_fs; + lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); + if (bp->b_blkno >= fsbtodb(fs, fs->fs_size)) { + mutex_exit(&si->si_lock); + return 0; + } + if ((fs->fs_flags & FS_DOWAPBL) && + fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) { + off_t blk_off, log_start, log_end; + + log_start = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] * + fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]; + log_end = log_start + fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] * + fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]; + blk_off = dbtob(bp->b_blkno); + if (blk_off >= log_start && blk_off < log_end) { + mutex_exit(&si->si_lock); + return 0; + } + } + snapblklist = si->si_snapblklist; + upper = (snapblklist != NULL ? snapblklist[0] - 1 : 0); + lower = 1; + while (lower <= upper) { + mid = (lower + upper) / 2; + if (snapblklist[mid] == lbn) + break; + if (snapblklist[mid] < lbn) + lower = mid + 1; + else + upper = mid - 1; + } + if (lower <= upper) { + mutex_exit(&si->si_lock); + return 0; + } + /* + * Not in the precomputed list, so check the snapshots. 
+ */ + if (si->si_owner != curlwp) { + if (!mutex_tryenter(&si->si_snaplock)) { + mutex_exit(&si->si_lock); + mutex_enter(&si->si_snaplock); + mutex_enter(&si->si_lock); + } + si->si_owner = curlwp; + snapshot_locked = 1; + } + if (data_valid && bp->b_bcount == fs->fs_bsize) + saved_data = bp->b_data; +retry: + gen = si->si_gen; + TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) { + vp = ITOV(ip); + /* + * We ensure that everything of our own that needs to be + * copied will be done at the time that ffs_snapshot is + * called. Thus we can skip the check here which can + * deadlock in doing the lookup in ffs_balloc. + */ + if (bp->b_vp == vp) + continue; + /* + * Check to see if block needs to be copied. + */ + if (lbn < NDADDR) { + blkno = db_get(ip, lbn); + } else { + mutex_exit(&si->si_lock); + blkno = 0; /* XXX: GCC */ + if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) { + mutex_enter(&si->si_lock); + break; + } + mutex_enter(&si->si_lock); + if (gen != si->si_gen) + goto retry; + } +#ifdef DIAGNOSTIC + if (blkno == BLK_SNAP && bp->b_lblkno >= 0) + panic("ffs_copyonwrite: bad copy block"); +#endif + if (blkno != 0) + continue; + + if (curlwp == uvm.pagedaemon_lwp) { + error = ENOMEM; + break; + } + /* Only one level of recursion allowed. */ + KASSERT(snapshot_locked); + /* + * Allocate the block into which to do the copy. Since + * multiple processes may all try to copy the same block, + * we have to recheck our need to do a copy if we sleep + * waiting for the lock. + * + * Because all snapshots on a filesystem share a single + * lock, we ensure that we will never be in competition + * with another process to allocate a block. + */ +#ifdef DEBUG + if (snapdebug) { + printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ", + (unsigned long long)ip->i_number, lbn); + if (bp->b_vp == devvp) + printf("fs metadata"); + else + printf("inum %llu", (unsigned long long) + VTOI(bp->b_vp)->i_number); + printf(" lblkno %" PRId64 "\n", bp->b_lblkno); + } +#endif + /* + * If we have already read the old block contents, then + * simply copy them to the new block. Note that we need + * to synchronously write snapshots that have not been + * unlinked, and hence will be visible after a crash, + * to ensure their integrity. + */ + mutex_exit(&si->si_lock); + if (saved_data == NULL) { + saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); + error = rwfsblk(vp, B_READ, saved_data, lbn); + if (error) { + free(saved_data, M_UFSMNT); + saved_data = NULL; + mutex_enter(&si->si_lock); + break; + } + } + error = wrsnapblk(vp, saved_data, lbn); + if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl) + error = syncsnap(vp); + mutex_enter(&si->si_lock); + if (error) + break; + if (gen != si->si_gen) + goto retry; + } + /* + * Note that we need to synchronously write snapshots that + * have not been unlinked, and hence will be visible after + * a crash, to ensure their integrity. + */ + if (snapshot_locked) { + si->si_owner = NULL; + mutex_exit(&si->si_lock); + mutex_exit(&si->si_snaplock); + } else + mutex_exit(&si->si_lock); + if (saved_data && saved_data != bp->b_data) + free(saved_data, M_UFSMNT); + return error; +} + +/* + * Read from a snapshot. 
+ */ +int +ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag) +{ + struct inode *ip = VTOI(vp); + struct fs *fs = ip->i_fs; + struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo; + struct buf *bp; + daddr_t lbn, nextlbn; + off_t fsbytes, bytesinfile; + long size, xfersize, blkoffset; + int error; + + fstrans_start(vp->v_mount, FSTRANS_SHARED); + mutex_enter(&si->si_snaplock); + + if (ioflag & IO_ALTSEMANTICS) + fsbytes = ip->i_size; + else + fsbytes = lfragtosize(fs, fs->fs_size); + for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { + bytesinfile = fsbytes - uio->uio_offset; + if (bytesinfile <= 0) + break; + lbn = lblkno(fs, uio->uio_offset); + nextlbn = lbn + 1; + size = fs->fs_bsize; + blkoffset = blkoff(fs, uio->uio_offset); + xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid), + bytesinfile); + + if (lblktosize(fs, nextlbn + 1) >= fsbytes) { + if (lblktosize(fs, lbn) + size > fsbytes) + size = fragroundup(fs, + fsbytes - lblktosize(fs, lbn)); + error = bread(vp, lbn, size, NOCRED, 0, &bp); + } else { + int nextsize = fs->fs_bsize; + error = breadn(vp, lbn, + size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp); + } + if (error) + break; + + /* + * We should only get non-zero b_resid when an I/O error + * has occurred, which should cause us to break above. + * However, if the short read did not cause an error, + * then we want to ensure that we do not uiomove bad + * or uninitialized data. + */ + size -= bp->b_resid; + if (size < blkoffset + xfersize) { + xfersize = size - blkoffset; + if (xfersize <= 0) + break; + } + error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); + if (error) + break; + brelse(bp, BC_AGE); + } + if (bp != NULL) + brelse(bp, BC_AGE); + + mutex_exit(&si->si_snaplock); + fstrans_done(vp->v_mount); + return error; +} + +/* + * Lookup a snapshots data block address. + * Simpler than UFS_BALLOC() as we know all metadata is already allocated + * and safe even for the pagedaemon where we cannot bread(). + */ +static int +snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res) +{ + struct indir indirs[NIADDR + 2]; + struct inode *ip = VTOI(vp); + struct fs *fs = ip->i_fs; + struct buf *bp; + int error, num; + + KASSERT(lbn >= 0); + + if (lbn < NDADDR) { + *res = db_get(ip, lbn); + return 0; + } + if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0) + return error; + if (curlwp == uvm.pagedaemon_lwp) { + mutex_enter(&bufcache_lock); + bp = incore(vp, indirs[num-1].in_lbn); + if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) { + *res = idb_get(ip, bp->b_data, indirs[num-1].in_off); + error = 0; + } else + error = ENOMEM; + mutex_exit(&bufcache_lock); + return error; + } + error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, NOCRED, 0, &bp); + if (error == 0) + *res = idb_get(ip, bp->b_data, indirs[num-1].in_off); + brelse(bp, 0); + + return error; +} + +/* + * Read or write the specified block of the filesystem vp resides on + * from or to the disk bypassing the buffer cache. 
+ */ +static int +rwfsblk(struct vnode *vp, int flags, void *data, daddr_t lbn) +{ + int error; + struct inode *ip = VTOI(vp); + struct fs *fs = ip->i_fs; + struct buf *nbp; + + nbp = getiobuf(NULL, true); + nbp->b_flags = flags; + nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize; + nbp->b_error = 0; + nbp->b_data = data; + nbp->b_blkno = nbp->b_rawblkno = fsbtodb(fs, blkstofrags(fs, lbn)); + nbp->b_proc = NULL; + nbp->b_dev = ip->i_devvp->v_rdev; + SET(nbp->b_cflags, BC_BUSY); /* mark buffer busy */ + + bdev_strategy(nbp); + + error = biowait(nbp); + + putiobuf(nbp); + + return error; +} + +/* + * Write all dirty buffers to disk and invalidate them. + */ +static int +syncsnap(struct vnode *vp) +{ + int error; + buf_t *bp; + struct fs *fs = VTOI(vp)->i_fs; + + mutex_enter(&bufcache_lock); + while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) { + error = bbusy(bp, false, 0, NULL); + if (error == EPASSTHROUGH) + continue; + else if (error != 0) { + mutex_exit(&bufcache_lock); + return error; + } + KASSERT(bp->b_bcount == fs->fs_bsize); + mutex_exit(&bufcache_lock); + error = rwfsblk(vp, B_WRITE, bp->b_data, + fragstoblks(fs, dbtofsb(fs, bp->b_blkno))); + brelse(bp, BC_INVAL | BC_VFLUSH); + if (error) + return error; + mutex_enter(&bufcache_lock); + } + mutex_exit(&bufcache_lock); + + return 0; +} + +/* + * Write the specified block to a snapshot. + */ +static int +wrsnapblk(struct vnode *vp, void *data, daddr_t lbn) +{ + struct inode *ip = VTOI(vp); + struct fs *fs = ip->i_fs; + struct buf *bp; + int error; + + error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, + FSCRED, (ip->i_nlink > 0 ? B_SYNC : 0), &bp); + if (error) + return error; + memcpy(bp->b_data, data, fs->fs_bsize); + if (ip->i_nlink > 0) + error = bwrite(bp); + else + bawrite(bp); + + return error; +} + +/* + * Check if this inode is present on the active snapshot list. + * Must be called with snapinfo locked. + */ +static inline bool +is_active_snapshot(struct snap_info *si, struct inode *ip) +{ + struct inode *xp; + + KASSERT(mutex_owned(&si->si_lock)); + + TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) + if (xp == ip) + return true; + return false; +} + +/* + * Get/Put direct block from inode or buffer containing disk addresses. Take + * care for fs type (UFS1/UFS2) and byte swapping. These functions should go + * into a global include. 
+ */ +static inline daddr_t +db_get(struct inode *ip, int loc) +{ + if (ip->i_ump->um_fstype == UFS1) + return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip)); + else + return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip)); +} + +static inline void +db_assign(struct inode *ip, int loc, daddr_t val) +{ + if (ip->i_ump->um_fstype == UFS1) + ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip)); + else + ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip)); +} + +static inline daddr_t +ib_get(struct inode *ip, int loc) +{ + if (ip->i_ump->um_fstype == UFS1) + return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip)); + else + return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip)); +} + +static inline void +ib_assign(struct inode *ip, int loc, daddr_t val) +{ + if (ip->i_ump->um_fstype == UFS1) + ip->i_ffs1_ib[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip)); + else + ip->i_ffs2_ib[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip)); +} + +static inline daddr_t +idb_get(struct inode *ip, void *bf, int loc) +{ + if (ip->i_ump->um_fstype == UFS1) + return ufs_rw32(((int32_t *)(bf))[loc], UFS_IPNEEDSWAP(ip)); + else + return ufs_rw64(((int64_t *)(bf))[loc], UFS_IPNEEDSWAP(ip)); +} + +static inline void +idb_assign(struct inode *ip, void *bf, int loc, daddr_t val) +{ + if (ip->i_ump->um_fstype == UFS1) + ((int32_t *)(bf))[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip)); + else + ((int64_t *)(bf))[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip)); +} diff --git a/sys/ufs/ffs/ffs_subr.c b/sys/ufs/ffs/ffs_subr.c new file mode 100644 index 000000000..6b6840357 --- /dev/null +++ b/sys/ufs/ffs/ffs_subr.c @@ -0,0 +1,371 @@ +/* $NetBSD: ffs_subr.c,v 1.47 2011/08/14 12:37:09 christos Exp $ */ + +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ffs_subr.c 8.5 (Berkeley) 3/21/95 + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +__KERNEL_RCSID(0, "$NetBSD: ffs_subr.c,v 1.47 2011/08/14 12:37:09 christos Exp $"); + +#include + +/* in ffs_tables.c */ +extern const int inside[], around[]; +extern const u_char * const fragtbl[]; + +#ifndef _KERNEL +#define FFS_EI /* always include byteswapped filesystems support */ +#endif +#include +#include +#include + +#ifndef _KERNEL +#include +void panic(const char *, ...) + __attribute__((__noreturn__,__format__(__printf__,1,2))); + +#else /* _KERNEL */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Load up the contents of an inode and copy the appropriate pieces + * to the incore copy. + */ +void +ffs_load_inode(struct buf *bp, struct inode *ip, struct fs *fs, ino_t ino) +{ + struct ufs1_dinode *dp1; + struct ufs2_dinode *dp2; + + if (ip->i_ump->um_fstype == UFS1) { + dp1 = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ino); +#ifdef FFS_EI + if (UFS_FSNEEDSWAP(fs)) + ffs_dinode1_swap(dp1, ip->i_din.ffs1_din); + else +#endif + *ip->i_din.ffs1_din = *dp1; + + ip->i_mode = ip->i_ffs1_mode; + ip->i_nlink = ip->i_ffs1_nlink; + ip->i_size = ip->i_ffs1_size; + ip->i_flags = ip->i_ffs1_flags; + ip->i_gen = ip->i_ffs1_gen; + ip->i_uid = ip->i_ffs1_uid; + ip->i_gid = ip->i_ffs1_gid; + } else { + dp2 = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ino); +#ifdef FFS_EI + if (UFS_FSNEEDSWAP(fs)) + ffs_dinode2_swap(dp2, ip->i_din.ffs2_din); + else +#endif + *ip->i_din.ffs2_din = *dp2; + + ip->i_mode = ip->i_ffs2_mode; + ip->i_nlink = ip->i_ffs2_nlink; + ip->i_size = ip->i_ffs2_size; + ip->i_flags = ip->i_ffs2_flags; + ip->i_gen = ip->i_ffs2_gen; + ip->i_uid = ip->i_ffs2_uid; + ip->i_gid = ip->i_ffs2_gid; + } +} + +int +ffs_getblk(struct vnode *vp, daddr_t lblkno, daddr_t blkno, int size, + bool clearbuf, buf_t **bpp) +{ + int error = 0; + + KASSERT(blkno >= 0 || blkno == FFS_NOBLK); + + if ((*bpp = getblk(vp, lblkno, size, 0, 0)) == NULL) + return ENOMEM; + if (blkno != FFS_NOBLK) + (*bpp)->b_blkno = blkno; + if (clearbuf) + clrbuf(*bpp); + if ((*bpp)->b_blkno >= 0 && (error = fscow_run(*bpp, false)) != 0) + brelse(*bpp, BC_INVAL); + return error; +} + +#endif /* _KERNEL */ + +/* + * Update the frsum fields to reflect addition or deletion + * of some frags. 
+ */ +void +ffs_fragacct(struct fs *fs, int fragmap, int32_t fraglist[], int cnt, + int needswap) +{ + int inblk; + int field, subfield; + int siz, pos; + + inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1; + fragmap <<= 1; + for (siz = 1; siz < fs->fs_frag; siz++) { + if ((inblk & (1 << (siz + (fs->fs_frag & (NBBY - 1))))) == 0) + continue; + field = around[siz]; + subfield = inside[siz]; + for (pos = siz; pos <= fs->fs_frag; pos++) { + if ((fragmap & field) == subfield) { + fraglist[siz] = ufs_rw32( + ufs_rw32(fraglist[siz], needswap) + cnt, + needswap); + pos += siz; + field <<= siz; + subfield <<= siz; + } + field <<= 1; + subfield <<= 1; + } + } +} + +/* + * block operations + * + * check if a block is available + * returns true if all the correponding bits in the free map are 1 + * returns false if any corresponding bit in the free map is 0 + */ +int +ffs_isblock(struct fs *fs, u_char *cp, int32_t h) +{ + u_char mask; + + switch ((int)fs->fs_fragshift) { + case 3: + return (cp[h] == 0xff); + case 2: + mask = 0x0f << ((h & 0x1) << 2); + return ((cp[h >> 1] & mask) == mask); + case 1: + mask = 0x03 << ((h & 0x3) << 1); + return ((cp[h >> 2] & mask) == mask); + case 0: + mask = 0x01 << (h & 0x7); + return ((cp[h >> 3] & mask) == mask); + default: + panic("ffs_isblock: unknown fs_fragshift %d", + (int)fs->fs_fragshift); + } +} + +/* + * check if a block is completely allocated + * returns true if all the corresponding bits in the free map are 0 + * returns false if any corresponding bit in the free map is 1 + */ +int +ffs_isfreeblock(struct fs *fs, u_char *cp, int32_t h) +{ + + switch ((int)fs->fs_fragshift) { + case 3: + return (cp[h] == 0); + case 2: + return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0); + case 1: + return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0); + case 0: + return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0); + default: + panic("ffs_isfreeblock: unknown fs_fragshift %d", + (int)fs->fs_fragshift); + } +} + +/* + * take a block out of the map + */ +void +ffs_clrblock(struct fs *fs, u_char *cp, int32_t h) +{ + + switch ((int)fs->fs_fragshift) { + case 3: + cp[h] = 0; + return; + case 2: + cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2)); + return; + case 1: + cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1)); + return; + case 0: + cp[h >> 3] &= ~(0x01 << (h & 0x7)); + return; + default: + panic("ffs_clrblock: unknown fs_fragshift %d", + (int)fs->fs_fragshift); + } +} + +/* + * put a block into the map + */ +void +ffs_setblock(struct fs *fs, u_char *cp, int32_t h) +{ + + switch ((int)fs->fs_fragshift) { + case 3: + cp[h] = 0xff; + return; + case 2: + cp[h >> 1] |= (0x0f << ((h & 0x1) << 2)); + return; + case 1: + cp[h >> 2] |= (0x03 << ((h & 0x3) << 1)); + return; + case 0: + cp[h >> 3] |= (0x01 << (h & 0x7)); + return; + default: + panic("ffs_setblock: unknown fs_fragshift %d", + (int)fs->fs_fragshift); + } +} + +/* + * Update the cluster map because of an allocation or free. + * + * Cnt == 1 means free; cnt == -1 means allocating. + */ +void +ffs_clusteracct(struct fs *fs, struct cg *cgp, int32_t blkno, int cnt) +{ + int32_t *sump; + int32_t *lp; + u_char *freemapp, *mapp; + int i, start, end, forw, back, map, bit; +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + + /* KASSERT(mutex_owned(&ump->um_lock)); */ + + if (fs->fs_contigsumsize <= 0) + return; + freemapp = cg_clustersfree(cgp, needswap); + sump = cg_clustersum(cgp, needswap); + /* + * Allocate or clear the actual block. 
+ */ + if (cnt > 0) + setbit(freemapp, blkno); + else + clrbit(freemapp, blkno); + /* + * Find the size of the cluster going forward. + */ + start = blkno + 1; + end = start + fs->fs_contigsumsize; + if ((uint32_t)end >= ufs_rw32(cgp->cg_nclusterblks, needswap)) + end = ufs_rw32(cgp->cg_nclusterblks, needswap); + mapp = &freemapp[start / NBBY]; + map = *mapp++; + bit = 1 << (start % NBBY); + for (i = start; i < end; i++) { + if ((map & bit) == 0) + break; + if ((i & (NBBY - 1)) != (NBBY - 1)) { + bit <<= 1; + } else { + map = *mapp++; + bit = 1; + } + } + forw = i - start; + /* + * Find the size of the cluster going backward. + */ + start = blkno - 1; + end = start - fs->fs_contigsumsize; + if (end < 0) + end = -1; + mapp = &freemapp[start / NBBY]; + map = *mapp--; + bit = 1 << (start % NBBY); + for (i = start; i > end; i--) { + if ((map & bit) == 0) + break; + if ((i & (NBBY - 1)) != 0) { + bit >>= 1; + } else { + map = *mapp--; + bit = 1 << (NBBY - 1); + } + } + back = start - i; + /* + * Account for old cluster and the possibly new forward and + * back clusters. + */ + i = back + forw + 1; + if (i > fs->fs_contigsumsize) + i = fs->fs_contigsumsize; + ufs_add32(sump[i], cnt, needswap); + if (back > 0) + ufs_add32(sump[back], -cnt, needswap); + if (forw > 0) + ufs_add32(sump[forw], -cnt, needswap); + + /* + * Update cluster summary information. + */ + lp = &sump[fs->fs_contigsumsize]; + for (i = fs->fs_contigsumsize; i > 0; i--) + if (ufs_rw32(*lp--, needswap) > 0) + break; +#if defined(_KERNEL) + fs->fs_maxcluster[ufs_rw32(cgp->cg_cgx, needswap)] = i; +#endif +} diff --git a/sys/ufs/ffs/ffs_tables.c b/sys/ufs/ffs/ffs_tables.c new file mode 100644 index 000000000..29f454247 --- /dev/null +++ b/sys/ufs/ffs/ffs_tables.c @@ -0,0 +1,141 @@ +/* $NetBSD: ffs_tables.c,v 1.9 2005/12/11 12:25:25 christos Exp $ */ + +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ffs_tables.c 8.1 (Berkeley) 6/11/93 + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +__KERNEL_RCSID(0, "$NetBSD: ffs_tables.c,v 1.9 2005/12/11 12:25:25 christos Exp $"); + +#include + +/* + * Bit patterns for identifying fragments in the block map + * used as ((map & around) == inside) + */ +const int around[9] = { + 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff +}; +const int inside[9] = { + 0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe +}; + +/* + * Given a block map bit pattern, the frag tables tell whether a + * particular size fragment is available. + * + * used as: + * if ((1 << (size - 1)) & fragtbl[fs->fs_frag][map] { + * at least one fragment of the indicated size is available + * } + * + * These tables are used by the scanc instruction on the VAX to + * quickly find an appropriate fragment. + */ +const u_char fragtbl124[256] = { + 0x00, 0x16, 0x16, 0x2a, 0x16, 0x16, 0x26, 0x4e, + 0x16, 0x16, 0x16, 0x3e, 0x2a, 0x3e, 0x4e, 0x8a, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x2a, 0x3e, 0x3e, 0x2a, 0x3e, 0x3e, 0x2e, 0x6e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x2a, 0x3e, 0x6e, 0xaa, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x26, 0x36, 0x36, 0x2e, 0x36, 0x36, 0x26, 0x6e, + 0x36, 0x36, 0x36, 0x3e, 0x2e, 0x3e, 0x6e, 0xae, + 0x4e, 0x5e, 0x5e, 0x6e, 0x5e, 0x5e, 0x6e, 0x4e, + 0x5e, 0x5e, 0x5e, 0x7e, 0x6e, 0x7e, 0x4e, 0xce, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, 0xbe, + 0x2a, 0x3e, 0x3e, 0x2a, 0x3e, 0x3e, 0x2e, 0x6e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x2a, 0x3e, 0x6e, 0xaa, + 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, 0xbe, + 0x4e, 0x5e, 0x5e, 0x6e, 0x5e, 0x5e, 0x6e, 0x4e, + 0x5e, 0x5e, 0x5e, 0x7e, 0x6e, 0x7e, 0x4e, 0xce, + 0x8a, 0x9e, 0x9e, 0xaa, 0x9e, 0x9e, 0xae, 0xce, + 0x9e, 0x9e, 0x9e, 0xbe, 0xaa, 0xbe, 0xce, 0x8a, +}; + +const u_char fragtbl8[256] = { + 0x00, 0x01, 0x01, 0x02, 0x01, 0x01, 0x02, 0x04, + 0x01, 0x01, 0x01, 0x03, 0x02, 0x03, 0x04, 0x08, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x02, 0x03, 0x03, 0x02, 0x04, 0x05, 0x08, 0x10, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, + 0x04, 0x05, 0x05, 0x06, 0x08, 0x09, 0x10, 0x20, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11, + 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, + 0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0a, + 0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04, + 0x08, 0x09, 0x09, 0x0a, 0x10, 0x11, 0x20, 0x40, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11, + 0x01, 
0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07, + 0x05, 0x05, 0x05, 0x07, 0x09, 0x09, 0x11, 0x21, + 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, + 0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0a, + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07, + 0x02, 0x03, 0x03, 0x02, 0x06, 0x07, 0x0a, 0x12, + 0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04, + 0x05, 0x05, 0x05, 0x07, 0x06, 0x07, 0x04, 0x0c, + 0x08, 0x09, 0x09, 0x0a, 0x09, 0x09, 0x0a, 0x0c, + 0x10, 0x11, 0x11, 0x12, 0x20, 0x21, 0x40, 0x80, +}; + +/* + * The actual fragtbl array. + */ +const u_char * const fragtbl[MAXFRAG + 1] = { + 0, fragtbl124, fragtbl124, 0, fragtbl124, 0, 0, 0, fragtbl8, +}; diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c new file mode 100644 index 000000000..28bbe32dc --- /dev/null +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -0,0 +1,2144 @@ +/* $NetBSD: ffs_vfsops.c,v 1.271 2011/11/14 18:35:14 hannken Exp $ */ + +/*- + * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc, and by Andrew Doran. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 1989, 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_vfsops.c 8.31 (Berkeley) 5/20/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ffs_vfsops.c,v 1.271 2011/11/14 18:35:14 hannken Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_ffs.h" +#include "opt_quota.h" +#include "opt_wapbl.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE(MODULE_CLASS_VFS, ffs, NULL); + +static int ffs_vfs_fsync(vnode_t *, int); + +static struct sysctllog *ffs_sysctl_log; + +/* how many times ffs_init() was called */ +int ffs_initcount = 0; + +extern const struct vnodeopv_desc ffs_vnodeop_opv_desc; +extern const struct vnodeopv_desc ffs_specop_opv_desc; +extern const struct vnodeopv_desc ffs_fifoop_opv_desc; + +const struct vnodeopv_desc * const ffs_vnodeopv_descs[] = { + &ffs_vnodeop_opv_desc, + &ffs_specop_opv_desc, + &ffs_fifoop_opv_desc, + NULL, +}; + +struct vfsops ffs_vfsops = { + MOUNT_FFS, + sizeof (struct ufs_args), + ffs_mount, + ufs_start, + ffs_unmount, + ufs_root, + ufs_quotactl, + ffs_statvfs, + ffs_sync, + ffs_vget, + ffs_fhtovp, + ffs_vptofh, + ffs_init, + ffs_reinit, + ffs_done, + ffs_mountroot, + ffs_snapshot, + ffs_extattrctl, + ffs_suspendctl, + genfs_renamelock_enter, + genfs_renamelock_exit, + ffs_vfs_fsync, + ffs_vnodeopv_descs, + 0, + { NULL, NULL }, +}; + +static const struct genfs_ops ffs_genfsops = { + .gop_size = ffs_gop_size, + .gop_alloc = ufs_gop_alloc, + .gop_write = genfs_gop_write, + .gop_markupdate = ufs_gop_markupdate, +}; + +static const struct ufs_ops ffs_ufsops = { + .uo_itimes = ffs_itimes, + .uo_update = ffs_update, + .uo_truncate = ffs_truncate, + .uo_valloc = ffs_valloc, + .uo_vfree = ffs_vfree, + .uo_balloc = ffs_balloc, + .uo_unmark_vnode = (void (*)(vnode_t *))nullop, +}; + +static int +ffs_modcmd(modcmd_t cmd, void *arg) +{ + int error; + +#if 0 + extern int doasyncfree; +#endif +#ifdef UFS_EXTATTR + extern int ufs_extattr_autocreate; +#endif + extern int ffs_log_changeopt; + + switch (cmd) { + case MODULE_CMD_INIT: + error = vfs_attach(&ffs_vfsops); + if (error != 0) + break; + + sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "vfs", NULL, + NULL, 0, NULL, 0, + CTL_VFS, CTL_EOL); + sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "ffs", + SYSCTL_DESCR("Berkeley Fast File System"), + NULL, 0, NULL, 0, + CTL_VFS, 1, CTL_EOL); + /* + * @@@ should we even bother with these first three? 
+ */ + sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "doclusterread", NULL, + sysctl_notavail, 0, NULL, 0, + CTL_VFS, 1, FFS_CLUSTERREAD, CTL_EOL); + sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "doclusterwrite", NULL, + sysctl_notavail, 0, NULL, 0, + CTL_VFS, 1, FFS_CLUSTERWRITE, CTL_EOL); + sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "doreallocblks", NULL, + sysctl_notavail, 0, NULL, 0, + CTL_VFS, 1, FFS_REALLOCBLKS, CTL_EOL); +#if 0 + sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "doasyncfree", + SYSCTL_DESCR("Release dirty blocks asynchronously"), + NULL, 0, &doasyncfree, 0, + CTL_VFS, 1, FFS_ASYNCFREE, CTL_EOL); +#endif + sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "log_changeopt", + SYSCTL_DESCR("Log changes in optimization strategy"), + NULL, 0, &ffs_log_changeopt, 0, + CTL_VFS, 1, FFS_LOG_CHANGEOPT, CTL_EOL); +#ifdef UFS_EXTATTR + sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "extattr_autocreate", + SYSCTL_DESCR("Size of attribute for " + "backing file autocreation"), + NULL, 0, &ufs_extattr_autocreate, 0, + CTL_VFS, 1, FFS_EXTATTR_AUTOCREATE, CTL_EOL); + +#endif /* UFS_EXTATTR */ + + break; + case MODULE_CMD_FINI: + error = vfs_detach(&ffs_vfsops); + if (error != 0) + break; + sysctl_teardown(&ffs_sysctl_log); + break; + default: + error = ENOTTY; + break; + } + + return (error); +} + +pool_cache_t ffs_inode_cache; +pool_cache_t ffs_dinode1_cache; +pool_cache_t ffs_dinode2_cache; + +static void ffs_oldfscompat_read(struct fs *, struct ufsmount *, daddr_t); +static void ffs_oldfscompat_write(struct fs *, struct ufsmount *); + +/* + * Called by main() when ffs is going to be mounted as root. + */ + +int +ffs_mountroot(void) +{ + struct fs *fs; + struct mount *mp; + struct lwp *l = curlwp; /* XXX */ + struct ufsmount *ump; + int error; + + if (device_class(root_device) != DV_DISK) + return (ENODEV); + + if ((error = vfs_rootmountalloc(MOUNT_FFS, "root_device", &mp))) { + vrele(rootvp); + return (error); + } + + /* + * We always need to be able to mount the root file system. + */ + mp->mnt_flag |= MNT_FORCE; + if ((error = ffs_mountfs(rootvp, mp, l)) != 0) { + vfs_unbusy(mp, false, NULL); + vfs_destroy(mp); + return (error); + } + mp->mnt_flag &= ~MNT_FORCE; + mutex_enter(&mountlist_lock); + CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mutex_exit(&mountlist_lock); + ump = VFSTOUFS(mp); + fs = ump->um_fs; + memset(fs->fs_fsmnt, 0, sizeof(fs->fs_fsmnt)); + (void)copystr(mp->mnt_stat.f_mntonname, fs->fs_fsmnt, MNAMELEN - 1, 0); + (void)ffs_statvfs(mp, &mp->mnt_stat); + vfs_unbusy(mp, false, NULL); + setrootfstime((time_t)fs->fs_time); + return (0); +} + +/* + * VFS Operations. 
+ * + * mount system call + */ +int +ffs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) +{ + struct lwp *l = curlwp; + struct vnode *devvp = NULL; + struct ufs_args *args = data; + struct ufsmount *ump = NULL; + struct fs *fs; + int error = 0, flags, update; + mode_t accessmode; + + if (*data_len < sizeof *args) + return EINVAL; + + if (mp->mnt_flag & MNT_GETARGS) { + ump = VFSTOUFS(mp); + if (ump == NULL) + return EIO; + args->fspec = NULL; + *data_len = sizeof *args; + return 0; + } + + update = mp->mnt_flag & MNT_UPDATE; + + /* Check arguments */ + if (args->fspec != NULL) { + /* + * Look up the name and verify that it's sane. + */ + error = namei_simple_user(args->fspec, + NSM_FOLLOW_NOEMULROOT, &devvp); + if (error != 0) + return (error); + + if (!update) { + /* + * Be sure this is a valid block device + */ + if (devvp->v_type != VBLK) + error = ENOTBLK; + else if (bdevsw_lookup(devvp->v_rdev) == NULL) + error = ENXIO; + } else { + /* + * Be sure we're still naming the same device + * used for our initial mount + */ + ump = VFSTOUFS(mp); + if (devvp != ump->um_devvp) { + if (devvp->v_rdev != ump->um_devvp->v_rdev) + error = EINVAL; + else { + vrele(devvp); + devvp = ump->um_devvp; + vref(devvp); + } + } + } + } else { + if (!update) { + /* New mounts must have a filename for the device */ + return (EINVAL); + } else { + /* Use the extant mount */ + ump = VFSTOUFS(mp); + devvp = ump->um_devvp; + vref(devvp); + } + } + + /* + * If mount by non-root, then verify that user has necessary + * permissions on the device. + * + * Permission to update a mount is checked higher, so here we presume + * updating the mount is okay (for example, as far as securelevel goes) + * which leaves us with the normal check. + */ + if (error == 0) { + accessmode = VREAD; + if (update ? + (mp->mnt_iflag & IMNT_WANTRDWR) != 0 : + (mp->mnt_flag & MNT_RDONLY) == 0) + accessmode |= VWRITE; + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = genfs_can_mount(devvp, accessmode, l->l_cred); + VOP_UNLOCK(devvp); + } + + if (error) { + vrele(devvp); + return (error); + } + +#ifdef WAPBL + /* WAPBL can only be enabled on a r/w mount. */ + if ((mp->mnt_flag & MNT_RDONLY) && !(mp->mnt_iflag & IMNT_WANTRDWR)) { + mp->mnt_flag &= ~MNT_LOG; + } +#else /* !WAPBL */ + mp->mnt_flag &= ~MNT_LOG; +#endif /* !WAPBL */ + + if (!update) { + int xflags; + + if (mp->mnt_flag & MNT_RDONLY) + xflags = FREAD; + else + xflags = FREAD | FWRITE; + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_OPEN(devvp, xflags, FSCRED); + VOP_UNLOCK(devvp); + if (error) + goto fail; + error = ffs_mountfs(devvp, mp, l); + if (error) { + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + (void)VOP_CLOSE(devvp, xflags, NOCRED); + VOP_UNLOCK(devvp); + goto fail; + } + + ump = VFSTOUFS(mp); + fs = ump->um_fs; + } else { + /* + * Update the mount. + */ + + /* + * The initial mount got a reference on this + * device, so drop the one obtained via + * namei(), above. 
+ */ + vrele(devvp); + + ump = VFSTOUFS(mp); + fs = ump->um_fs; + if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) { + /* + * Changing from r/w to r/o + */ + flags = WRITECLOSE; + if (mp->mnt_flag & MNT_FORCE) + flags |= FORCECLOSE; + error = ffs_flushfiles(mp, flags, l); + if (error == 0) + error = UFS_WAPBL_BEGIN(mp); + if (error == 0 && + ffs_cgupdate(ump, MNT_WAIT) == 0 && + fs->fs_clean & FS_WASCLEAN) { + if (mp->mnt_flag & MNT_SOFTDEP) + fs->fs_flags &= ~FS_DOSOFTDEP; + fs->fs_clean = FS_ISCLEAN; + (void) ffs_sbupdate(ump, MNT_WAIT); + } + if (error == 0) + UFS_WAPBL_END(mp); + if (error) + return (error); + } + +#ifdef WAPBL + if ((mp->mnt_flag & MNT_LOG) == 0) { + error = ffs_wapbl_stop(mp, mp->mnt_flag & MNT_FORCE); + if (error) + return error; + } +#endif /* WAPBL */ + + if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) { + /* + * Finish change from r/w to r/o + */ + fs->fs_ronly = 1; + fs->fs_fmod = 0; + } + + if (mp->mnt_flag & MNT_RELOAD) { + error = ffs_reload(mp, l->l_cred, l); + if (error) + return (error); + } + + if (fs->fs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR)) { + /* + * Changing from read-only to read/write + */ +#ifndef QUOTA2 + if (fs->fs_flags & FS_DOQUOTA2) { + ump->um_flags |= UFS_QUOTA2; + uprintf("%s: options QUOTA2 not enabled%s\n", + mp->mnt_stat.f_mntonname, + (mp->mnt_flag & MNT_FORCE) ? "" : + ", not mounting"); + return EINVAL; + } +#endif + fs->fs_ronly = 0; + fs->fs_clean <<= 1; + fs->fs_fmod = 1; +#ifdef WAPBL + if (fs->fs_flags & FS_DOWAPBL) { + printf("%s: replaying log to disk\n", + fs->fs_fsmnt); + KDASSERT(mp->mnt_wapbl_replay); + error = wapbl_replay_write(mp->mnt_wapbl_replay, + devvp); + if (error) { + return error; + } + wapbl_replay_stop(mp->mnt_wapbl_replay); + fs->fs_clean = FS_WASCLEAN; + } +#endif /* WAPBL */ + if (fs->fs_snapinum[0] != 0) + ffs_snapshot_mount(mp); + } + +#ifdef WAPBL + error = ffs_wapbl_start(mp); + if (error) + return error; +#endif /* WAPBL */ + +#ifdef QUOTA2 + if (!fs->fs_ronly) { + error = ffs_quota2_mount(mp); + if (error) { + return error; + } + } +#endif + if (args->fspec == NULL) + return 0; + } + + error = set_statvfs_info(path, UIO_USERSPACE, args->fspec, + UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l); + if (error == 0) + (void)strncpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, + sizeof(fs->fs_fsmnt)); + fs->fs_flags &= ~FS_DOSOFTDEP; + if (fs->fs_fmod != 0) { /* XXX */ + int err; + + fs->fs_fmod = 0; + if (fs->fs_clean & FS_WASCLEAN) + fs->fs_time = time_second; + else { + printf("%s: file system not clean (fs_clean=%#x); " + "please fsck(8)\n", mp->mnt_stat.f_mntfromname, + fs->fs_clean); + printf("%s: lost blocks %" PRId64 " files %d\n", + mp->mnt_stat.f_mntfromname, fs->fs_pendingblocks, + fs->fs_pendinginodes); + } + err = UFS_WAPBL_BEGIN(mp); + if (err == 0) { + (void) ffs_cgupdate(ump, MNT_WAIT); + UFS_WAPBL_END(mp); + } + } + if ((mp->mnt_flag & MNT_SOFTDEP) != 0) { + printf("%s: `-o softdep' is no longer supported, " + "consider `-o log'\n", mp->mnt_stat.f_mntfromname); + mp->mnt_flag &= ~MNT_SOFTDEP; + } + + return (error); + +fail: + vrele(devvp); + return (error); +} + +/* + * Reload all incore data for a filesystem (used after running fsck on + * the root filesystem and finding things to fix). The filesystem must + * be mounted read-only. + * + * Things to do to update the mount: + * 1) invalidate all cached meta-data. + * 2) re-read superblock from disk. + * 3) re-read summary information from disk. + * 4) invalidate all inactive vnodes. + * 5) invalidate all cached file data. 
+ * 6) re-read inode data for all active vnodes. + */ +int +ffs_reload(struct mount *mp, kauth_cred_t cred, struct lwp *l) +{ + struct vnode *vp, *mvp, *devvp; + struct inode *ip; + void *space; + struct buf *bp; + struct fs *fs, *newfs; + struct dkwedge_info dkw; + int i, bsize, blks, error; + int32_t *lp; + struct ufsmount *ump; + daddr_t sblockloc; + + if ((mp->mnt_flag & MNT_RDONLY) == 0) + return (EINVAL); + + ump = VFSTOUFS(mp); + /* + * Step 1: invalidate all cached meta-data. + */ + devvp = ump->um_devvp; + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = vinvalbuf(devvp, 0, cred, l, 0, 0); + VOP_UNLOCK(devvp); + if (error) + panic("ffs_reload: dirty1"); + /* + * Step 2: re-read superblock from disk. + */ + fs = ump->um_fs; + + /* XXX we don't handle possibility that superblock moved. */ + error = bread(devvp, fs->fs_sblockloc / DEV_BSIZE, fs->fs_sbsize, + NOCRED, 0, &bp); + if (error) { + brelse(bp, 0); + return (error); + } + newfs = malloc(fs->fs_sbsize, M_UFSMNT, M_WAITOK); + memcpy(newfs, bp->b_data, fs->fs_sbsize); +#ifdef FFS_EI + if (ump->um_flags & UFS_NEEDSWAP) { + ffs_sb_swap((struct fs*)bp->b_data, newfs); + fs->fs_flags |= FS_SWAPPED; + } else +#endif + fs->fs_flags &= ~FS_SWAPPED; + if ((newfs->fs_magic != FS_UFS1_MAGIC && + newfs->fs_magic != FS_UFS2_MAGIC)|| + newfs->fs_bsize > MAXBSIZE || + newfs->fs_bsize < sizeof(struct fs)) { + brelse(bp, 0); + free(newfs, M_UFSMNT); + return (EIO); /* XXX needs translation */ + } + /* Store off old fs_sblockloc for fs_oldfscompat_read. */ + sblockloc = fs->fs_sblockloc; + /* + * Copy pointer fields back into superblock before copying in XXX + * new superblock. These should really be in the ufsmount. XXX + * Note that important parameters (eg fs_ncg) are unchanged. + */ + newfs->fs_csp = fs->fs_csp; + newfs->fs_maxcluster = fs->fs_maxcluster; + newfs->fs_contigdirs = fs->fs_contigdirs; + newfs->fs_ronly = fs->fs_ronly; + newfs->fs_active = fs->fs_active; + memcpy(fs, newfs, (u_int)fs->fs_sbsize); + brelse(bp, 0); + free(newfs, M_UFSMNT); + + /* Recheck for apple UFS filesystem */ + ump->um_flags &= ~UFS_ISAPPLEUFS; + /* First check to see if this is tagged as an Apple UFS filesystem + * in the disklabel + */ + if (getdiskinfo(devvp, &dkw) == 0 && + strcmp(dkw.dkw_ptype, DKW_PTYPE_APPLEUFS) == 0) + ump->um_flags |= UFS_ISAPPLEUFS; +#ifdef APPLE_UFS + else { + /* Manually look for an apple ufs label, and if a valid one + * is found, then treat it like an Apple UFS filesystem anyway + * + * EINVAL is most probably a blocksize or alignment problem, + * it is unlikely that this is an Apple UFS filesystem then. 
+ */ + error = bread(devvp, (daddr_t)(APPLEUFS_LABEL_OFFSET / DEV_BSIZE), + APPLEUFS_LABEL_SIZE, cred, 0, &bp); + if (error && error != EINVAL) { + brelse(bp, 0); + return (error); + } + if (error == 0) { + error = ffs_appleufs_validate(fs->fs_fsmnt, + (struct appleufslabel *)bp->b_data, NULL); + if (error == 0) + ump->um_flags |= UFS_ISAPPLEUFS; + } + brelse(bp, 0); + bp = NULL; + } +#else + if (ump->um_flags & UFS_ISAPPLEUFS) + return (EIO); +#endif + + if (UFS_MPISAPPLEUFS(ump)) { + /* see comment about NeXT below */ + ump->um_maxsymlinklen = APPLEUFS_MAXSYMLINKLEN; + ump->um_dirblksiz = APPLEUFS_DIRBLKSIZ; + mp->mnt_iflag |= IMNT_DTYPE; + } else { + ump->um_maxsymlinklen = fs->fs_maxsymlinklen; + ump->um_dirblksiz = DIRBLKSIZ; + if (ump->um_maxsymlinklen > 0) + mp->mnt_iflag |= IMNT_DTYPE; + else + mp->mnt_iflag &= ~IMNT_DTYPE; + } + ffs_oldfscompat_read(fs, ump, sblockloc); + + mutex_enter(&ump->um_lock); + ump->um_maxfilesize = fs->fs_maxfilesize; + if (fs->fs_flags & ~(FS_KNOWN_FLAGS | FS_INTERNAL)) { + uprintf("%s: unknown ufs flags: 0x%08"PRIx32"%s\n", + mp->mnt_stat.f_mntonname, fs->fs_flags, + (mp->mnt_flag & MNT_FORCE) ? "" : ", not mounting"); + if ((mp->mnt_flag & MNT_FORCE) == 0) { + mutex_exit(&ump->um_lock); + return (EINVAL); + } + } + if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { + fs->fs_pendingblocks = 0; + fs->fs_pendinginodes = 0; + } + mutex_exit(&ump->um_lock); + + ffs_statvfs(mp, &mp->mnt_stat); + /* + * Step 3: re-read summary information from disk. + */ + blks = howmany(fs->fs_cssize, fs->fs_fsize); + space = fs->fs_csp; + for (i = 0; i < blks; i += fs->fs_frag) { + bsize = fs->fs_bsize; + if (i + fs->fs_frag > blks) + bsize = (blks - i) * fs->fs_fsize; + error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), bsize, + NOCRED, 0, &bp); + if (error) { + brelse(bp, 0); + return (error); + } +#ifdef FFS_EI + if (UFS_FSNEEDSWAP(fs)) + ffs_csum_swap((struct csum *)bp->b_data, + (struct csum *)space, bsize); + else +#endif + memcpy(space, bp->b_data, (size_t)bsize); + space = (char *)space + bsize; + brelse(bp, 0); + } + if (fs->fs_snapinum[0] != 0) + ffs_snapshot_mount(mp); + /* + * We no longer know anything about clusters per cylinder group. + */ + if (fs->fs_contigsumsize > 0) { + lp = fs->fs_maxcluster; + for (i = 0; i < fs->fs_ncg; i++) + *lp++ = fs->fs_contigsumsize; + } + + /* Allocate a marker vnode. */ + mvp = vnalloc(mp); + /* + * NOTE: not using the TAILQ_FOREACH here since in this loop vgone() + * and vclean() can be called indirectly + */ + mutex_enter(&mntvnode_lock); + loop: + for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) { + vmark(mvp, vp); + if (vp->v_mount != mp || vismarker(vp)) + continue; + /* + * Step 4: invalidate all inactive vnodes. + */ + if (vrecycle(vp, &mntvnode_lock, l)) { + mutex_enter(&mntvnode_lock); + (void)vunmark(mvp); + goto loop; + } + /* + * Step 5: invalidate all cached file data. + */ + mutex_enter(vp->v_interlock); + mutex_exit(&mntvnode_lock); + if (vget(vp, LK_EXCLUSIVE)) { + (void)vunmark(mvp); + goto loop; + } + if (vinvalbuf(vp, 0, cred, l, 0, 0)) + panic("ffs_reload: dirty2"); + /* + * Step 6: re-read inode data for all active vnodes. 
+ */ + ip = VTOI(vp); + error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), + (int)fs->fs_bsize, NOCRED, 0, &bp); + if (error) { + brelse(bp, 0); + vput(vp); + (void)vunmark(mvp); + break; + } + ffs_load_inode(bp, ip, fs, ip->i_number); + brelse(bp, 0); + vput(vp); + mutex_enter(&mntvnode_lock); + } + mutex_exit(&mntvnode_lock); + vnfree(mvp); + return (error); +} + +/* + * Possible superblock locations ordered from most to least likely. + */ +static const int sblock_try[] = SBLOCKSEARCH; + +/* + * Common code for mount and mountroot + */ +int +ffs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l) +{ + struct ufsmount *ump; + struct buf *bp; + struct fs *fs; + dev_t dev; + struct dkwedge_info dkw; + void *space; + daddr_t sblockloc, fsblockloc; + int blks, fstype; + int error, i, bsize, ronly, bset = 0; +#ifdef FFS_EI + int needswap = 0; /* keep gcc happy */ +#endif + int32_t *lp; + kauth_cred_t cred; + u_int32_t sbsize = 8192; /* keep gcc happy*/ + int32_t fsbsize; + + dev = devvp->v_rdev; + cred = l ? l->l_cred : NOCRED; + + /* Flush out any old buffers remaining from a previous use. */ + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = vinvalbuf(devvp, V_SAVE, cred, l, 0, 0); + VOP_UNLOCK(devvp); + if (error) + return (error); + + ronly = (mp->mnt_flag & MNT_RDONLY) != 0; + + bp = NULL; + ump = NULL; + fs = NULL; + sblockloc = 0; + fstype = 0; + + error = fstrans_mount(mp); + if (error) + return error; + + ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK); + memset(ump, 0, sizeof *ump); + mutex_init(&ump->um_lock, MUTEX_DEFAULT, IPL_NONE); + error = ffs_snapshot_init(ump); + if (error) + goto out; + ump->um_ops = &ffs_ufsops; + +#ifdef WAPBL + sbagain: +#endif + /* + * Try reading the superblock in each of its possible locations. + */ + for (i = 0; ; i++) { + if (bp != NULL) { + brelse(bp, BC_NOCACHE); + bp = NULL; + } + if (sblock_try[i] == -1) { + error = EINVAL; + fs = NULL; + goto out; + } + error = bread(devvp, sblock_try[i] / DEV_BSIZE, SBLOCKSIZE, cred, + 0, &bp); + if (error) { + fs = NULL; + goto out; + } + fs = (struct fs*)bp->b_data; + fsblockloc = sblockloc = sblock_try[i]; + if (fs->fs_magic == FS_UFS1_MAGIC) { + sbsize = fs->fs_sbsize; + fstype = UFS1; + fsbsize = fs->fs_bsize; +#ifdef FFS_EI + needswap = 0; + } else if (fs->fs_magic == bswap32(FS_UFS1_MAGIC)) { + sbsize = bswap32(fs->fs_sbsize); + fstype = UFS1; + fsbsize = bswap32(fs->fs_bsize); + needswap = 1; +#endif + } else if (fs->fs_magic == FS_UFS2_MAGIC) { + sbsize = fs->fs_sbsize; + fstype = UFS2; + fsbsize = fs->fs_bsize; +#ifdef FFS_EI + needswap = 0; + } else if (fs->fs_magic == bswap32(FS_UFS2_MAGIC)) { + sbsize = bswap32(fs->fs_sbsize); + fstype = UFS2; + fsbsize = bswap32(fs->fs_bsize); + needswap = 1; +#endif + } else + continue; + + + /* fs->fs_sblockloc isn't defined for old filesystems */ + if (fstype == UFS1 && !(fs->fs_old_flags & FS_FLAGS_UPDATED)) { + if (sblockloc == SBLOCK_UFS2) + /* + * This is likely to be the first alternate + * in a filesystem with 64k blocks. + * Don't use it. 
+ */ + continue; + fsblockloc = sblockloc; + } else { + fsblockloc = fs->fs_sblockloc; +#ifdef FFS_EI + if (needswap) + fsblockloc = bswap64(fsblockloc); +#endif + } + + /* Check we haven't found an alternate superblock */ + if (fsblockloc != sblockloc) + continue; + + /* Validate size of superblock */ + if (sbsize > MAXBSIZE || sbsize < sizeof(struct fs)) + continue; + + /* Check that we can handle the file system blocksize */ + if (fsbsize > MAXBSIZE) { + printf("ffs_mountfs: block size (%d) > MAXBSIZE (%d)\n", + fsbsize, MAXBSIZE); + continue; + } + + /* Ok seems to be a good superblock */ + break; + } + + fs = malloc((u_long)sbsize, M_UFSMNT, M_WAITOK); + memcpy(fs, bp->b_data, sbsize); + ump->um_fs = fs; + +#ifdef FFS_EI + if (needswap) { + ffs_sb_swap((struct fs*)bp->b_data, fs); + fs->fs_flags |= FS_SWAPPED; + } else +#endif + fs->fs_flags &= ~FS_SWAPPED; + +#ifdef WAPBL + if ((mp->mnt_wapbl_replay == 0) && (fs->fs_flags & FS_DOWAPBL)) { + error = ffs_wapbl_replay_start(mp, fs, devvp); + if (error && (mp->mnt_flag & MNT_FORCE) == 0) + goto out; + if (!error) { + if (!ronly) { + /* XXX fsmnt may be stale. */ + printf("%s: replaying log to disk\n", + fs->fs_fsmnt); + error = wapbl_replay_write(mp->mnt_wapbl_replay, + devvp); + if (error) + goto out; + wapbl_replay_stop(mp->mnt_wapbl_replay); + fs->fs_clean = FS_WASCLEAN; + } else { + /* XXX fsmnt may be stale */ + printf("%s: replaying log to memory\n", + fs->fs_fsmnt); + } + + /* Force a re-read of the superblock */ + brelse(bp, BC_INVAL); + bp = NULL; + free(fs, M_UFSMNT); + fs = NULL; + goto sbagain; + } + } +#else /* !WAPBL */ + if ((fs->fs_flags & FS_DOWAPBL) && (mp->mnt_flag & MNT_FORCE) == 0) { + error = EPERM; + goto out; + } +#endif /* !WAPBL */ + + ffs_oldfscompat_read(fs, ump, sblockloc); + ump->um_maxfilesize = fs->fs_maxfilesize; + + if (fs->fs_flags & ~(FS_KNOWN_FLAGS | FS_INTERNAL)) { + uprintf("%s: unknown ufs flags: 0x%08"PRIx32"%s\n", + mp->mnt_stat.f_mntonname, fs->fs_flags, + (mp->mnt_flag & MNT_FORCE) ? "" : ", not mounting"); + if ((mp->mnt_flag & MNT_FORCE) == 0) { + error = EINVAL; + goto out; + } + } + + if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { + fs->fs_pendingblocks = 0; + fs->fs_pendinginodes = 0; + } + + ump->um_fstype = fstype; + if (fs->fs_sbsize < SBLOCKSIZE) + brelse(bp, BC_INVAL); + else + brelse(bp, 0); + bp = NULL; + + /* First check to see if this is tagged as an Apple UFS filesystem + * in the disklabel + */ + if (getdiskinfo(devvp, &dkw) == 0 && + strcmp(dkw.dkw_ptype, DKW_PTYPE_APPLEUFS) == 0) + ump->um_flags |= UFS_ISAPPLEUFS; +#ifdef APPLE_UFS + else { + /* Manually look for an apple ufs label, and if a valid one + * is found, then treat it like an Apple UFS filesystem anyway + */ + error = bread(devvp, (daddr_t)(APPLEUFS_LABEL_OFFSET / DEV_BSIZE), + APPLEUFS_LABEL_SIZE, cred, 0, &bp); + if (error) + goto out; + error = ffs_appleufs_validate(fs->fs_fsmnt, + (struct appleufslabel *)bp->b_data, NULL); + if (error == 0) { + ump->um_flags |= UFS_ISAPPLEUFS; + } + brelse(bp, 0); + bp = NULL; + } +#else + if (ump->um_flags & UFS_ISAPPLEUFS) { + error = EINVAL; + goto out; + } +#endif + +#if 0 +/* + * XXX This code changes the behaviour of mounting dirty filesystems, to + * XXX require "mount -f ..." to mount them. This doesn't match what + * XXX mount(8) describes and is disabled for now. + */ + /* + * If the file system is not clean, don't allow it to be mounted + * unless MNT_FORCE is specified. (Note: MNT_FORCE is always set + * for the root file system.) 
+ */ + if (fs->fs_flags & FS_DOWAPBL) { + /* + * wapbl normally expects to be FS_WASCLEAN when the FS_DOWAPBL + * bit is set, although there's a window in unmount where it + * could be FS_ISCLEAN + */ + if ((mp->mnt_flag & MNT_FORCE) == 0 && + (fs->fs_clean & (FS_WASCLEAN | FS_ISCLEAN)) == 0) { + error = EPERM; + goto out; + } + } else + if ((fs->fs_clean & FS_ISCLEAN) == 0 && + (mp->mnt_flag & MNT_FORCE) == 0) { + error = EPERM; + goto out; + } +#endif + + /* + * verify that we can access the last block in the fs + * if we're mounting read/write. + */ + + if (!ronly) { + error = bread(devvp, fsbtodb(fs, fs->fs_size - 1), fs->fs_fsize, + cred, 0, &bp); + if (bp->b_bcount != fs->fs_fsize) + error = EINVAL; + if (error) { + bset = BC_INVAL; + goto out; + } + brelse(bp, BC_INVAL); + bp = NULL; + } + + fs->fs_ronly = ronly; + /* Don't bump fs_clean if we're replaying journal */ + if (!((fs->fs_flags & FS_DOWAPBL) && (fs->fs_clean & FS_WASCLEAN))) + if (ronly == 0) { + fs->fs_clean <<= 1; + fs->fs_fmod = 1; + } + bsize = fs->fs_cssize; + blks = howmany(bsize, fs->fs_fsize); + if (fs->fs_contigsumsize > 0) + bsize += fs->fs_ncg * sizeof(int32_t); + bsize += fs->fs_ncg * sizeof(*fs->fs_contigdirs); + space = malloc((u_long)bsize, M_UFSMNT, M_WAITOK); + fs->fs_csp = space; + for (i = 0; i < blks; i += fs->fs_frag) { + bsize = fs->fs_bsize; + if (i + fs->fs_frag > blks) + bsize = (blks - i) * fs->fs_fsize; + error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), bsize, + cred, 0, &bp); + if (error) { + free(fs->fs_csp, M_UFSMNT); + goto out; + } +#ifdef FFS_EI + if (needswap) + ffs_csum_swap((struct csum *)bp->b_data, + (struct csum *)space, bsize); + else +#endif + memcpy(space, bp->b_data, (u_int)bsize); + + space = (char *)space + bsize; + brelse(bp, 0); + bp = NULL; + } + if (fs->fs_contigsumsize > 0) { + fs->fs_maxcluster = lp = space; + for (i = 0; i < fs->fs_ncg; i++) + *lp++ = fs->fs_contigsumsize; + space = lp; + } + bsize = fs->fs_ncg * sizeof(*fs->fs_contigdirs); + fs->fs_contigdirs = space; + space = (char *)space + bsize; + memset(fs->fs_contigdirs, 0, bsize); + /* Compatibility for old filesystems - XXX */ + if (fs->fs_avgfilesize <= 0) + fs->fs_avgfilesize = AVFILESIZ; + if (fs->fs_avgfpdir <= 0) + fs->fs_avgfpdir = AFPDIR; + fs->fs_active = NULL; + mp->mnt_data = ump; + mp->mnt_stat.f_fsidx.__fsid_val[0] = (long)dev; + mp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_FFS); + mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; + mp->mnt_stat.f_namemax = FFS_MAXNAMLEN; + if (UFS_MPISAPPLEUFS(ump)) { + /* NeXT used to keep short symlinks in the inode even + * when using FS_42INODEFMT. In that case fs->fs_maxsymlinklen + * is probably -1, but we still need to be able to identify + * short symlinks. 
+ */ + ump->um_maxsymlinklen = APPLEUFS_MAXSYMLINKLEN; + ump->um_dirblksiz = APPLEUFS_DIRBLKSIZ; + mp->mnt_iflag |= IMNT_DTYPE; + } else { + ump->um_maxsymlinklen = fs->fs_maxsymlinklen; + ump->um_dirblksiz = DIRBLKSIZ; + if (ump->um_maxsymlinklen > 0) + mp->mnt_iflag |= IMNT_DTYPE; + else + mp->mnt_iflag &= ~IMNT_DTYPE; + } + mp->mnt_fs_bshift = fs->fs_bshift; + mp->mnt_dev_bshift = DEV_BSHIFT; /* XXX */ + mp->mnt_flag |= MNT_LOCAL; + mp->mnt_iflag |= IMNT_MPSAFE; +#ifdef FFS_EI + if (needswap) + ump->um_flags |= UFS_NEEDSWAP; +#endif + ump->um_mountp = mp; + ump->um_dev = dev; + ump->um_devvp = devvp; + ump->um_nindir = fs->fs_nindir; + ump->um_lognindir = ffs(fs->fs_nindir) - 1; + ump->um_bptrtodb = fs->fs_fshift - DEV_BSHIFT; + ump->um_seqinc = fs->fs_frag; + for (i = 0; i < MAXQUOTAS; i++) + ump->um_quotas[i] = NULLVP; + devvp->v_specmountpoint = mp; + if (ronly == 0 && fs->fs_snapinum[0] != 0) + ffs_snapshot_mount(mp); +#ifdef WAPBL + if (!ronly) { + KDASSERT(fs->fs_ronly == 0); + /* + * ffs_wapbl_start() needs mp->mnt_stat initialised if it + * needs to create a new log file in-filesystem. + */ + ffs_statvfs(mp, &mp->mnt_stat); + + error = ffs_wapbl_start(mp); + if (error) { + free(fs->fs_csp, M_UFSMNT); + goto out; + } + } +#endif /* WAPBL */ + if (ronly == 0) { +#ifdef QUOTA2 + error = ffs_quota2_mount(mp); + if (error) { + free(fs->fs_csp, M_UFSMNT); + goto out; + } +#else + if (fs->fs_flags & FS_DOQUOTA2) { + ump->um_flags |= UFS_QUOTA2; + uprintf("%s: options QUOTA2 not enabled%s\n", + mp->mnt_stat.f_mntonname, + (mp->mnt_flag & MNT_FORCE) ? "" : ", not mounting"); + if ((mp->mnt_flag & MNT_FORCE) == 0) { + error = EINVAL; + free(fs->fs_csp, M_UFSMNT); + goto out; + } + } +#endif + } +#ifdef UFS_EXTATTR + /* + * Initialize file-backed extended attributes on UFS1 file + * systems. + */ + if (ump->um_fstype == UFS1) + ufs_extattr_uepm_init(&ump->um_extattr); +#endif /* UFS_EXTATTR */ + + return (0); +out: +#ifdef WAPBL + if (mp->mnt_wapbl_replay) { + wapbl_replay_stop(mp->mnt_wapbl_replay); + wapbl_replay_free(mp->mnt_wapbl_replay); + mp->mnt_wapbl_replay = 0; + } +#endif + + fstrans_unmount(mp); + if (fs) + free(fs, M_UFSMNT); + devvp->v_specmountpoint = NULL; + if (bp) + brelse(bp, bset); + if (ump) { + if (ump->um_oldfscompat) + free(ump->um_oldfscompat, M_UFSMNT); + mutex_destroy(&ump->um_lock); + free(ump, M_UFSMNT); + mp->mnt_data = NULL; + } + return (error); +} + +/* + * Sanity checks for loading old filesystem superblocks. + * See ffs_oldfscompat_write below for unwound actions. + * + * XXX - Parts get retired eventually. + * Unfortunately new bits get added. + */ +static void +ffs_oldfscompat_read(struct fs *fs, struct ufsmount *ump, daddr_t sblockloc) +{ + off_t maxfilesize; + int32_t *extrasave; + + if ((fs->fs_magic != FS_UFS1_MAGIC) || + (fs->fs_old_flags & FS_FLAGS_UPDATED)) + return; + + if (!ump->um_oldfscompat) + ump->um_oldfscompat = malloc(512 + 3*sizeof(int32_t), + M_UFSMNT, M_WAITOK); + + memcpy(ump->um_oldfscompat, &fs->fs_old_postbl_start, 512); + extrasave = ump->um_oldfscompat; + extrasave += 512/sizeof(int32_t); + extrasave[0] = fs->fs_old_npsect; + extrasave[1] = fs->fs_old_interleave; + extrasave[2] = fs->fs_old_trackskew; + + /* These fields will be overwritten by their + * original values in fs_oldfscompat_write, so it is harmless + * to modify them here. 
+ */ + fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir; + fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree; + fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree; + fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree; + + fs->fs_maxbsize = fs->fs_bsize; + fs->fs_time = fs->fs_old_time; + fs->fs_size = fs->fs_old_size; + fs->fs_dsize = fs->fs_old_dsize; + fs->fs_csaddr = fs->fs_old_csaddr; + fs->fs_sblockloc = sblockloc; + + fs->fs_flags = fs->fs_old_flags | (fs->fs_flags & FS_INTERNAL); + + if (fs->fs_old_postblformat == FS_42POSTBLFMT) { + fs->fs_old_nrpos = 8; + fs->fs_old_npsect = fs->fs_old_nsect; + fs->fs_old_interleave = 1; + fs->fs_old_trackskew = 0; + } + + if (fs->fs_old_inodefmt < FS_44INODEFMT) { + fs->fs_maxfilesize = (u_quad_t) 1LL << 39; + fs->fs_qbmask = ~fs->fs_bmask; + fs->fs_qfmask = ~fs->fs_fmask; + } + + maxfilesize = (u_int64_t)0x80000000 * fs->fs_bsize - 1; + if (fs->fs_maxfilesize > maxfilesize) + fs->fs_maxfilesize = maxfilesize; + + /* Compatibility for old filesystems */ + if (fs->fs_avgfilesize <= 0) + fs->fs_avgfilesize = AVFILESIZ; + if (fs->fs_avgfpdir <= 0) + fs->fs_avgfpdir = AFPDIR; + +#if 0 + if (bigcgs) { + fs->fs_save_cgsize = fs->fs_cgsize; + fs->fs_cgsize = fs->fs_bsize; + } +#endif +} + +/* + * Unwinding superblock updates for old filesystems. + * See ffs_oldfscompat_read above for details. + * + * XXX - Parts get retired eventually. + * Unfortunately new bits get added. + */ +static void +ffs_oldfscompat_write(struct fs *fs, struct ufsmount *ump) +{ + int32_t *extrasave; + + if ((fs->fs_magic != FS_UFS1_MAGIC) || + (fs->fs_old_flags & FS_FLAGS_UPDATED)) + return; + + fs->fs_old_time = fs->fs_time; + fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir; + fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree; + fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree; + fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree; + fs->fs_old_flags = fs->fs_flags; + +#if 0 + if (bigcgs) { + fs->fs_cgsize = fs->fs_save_cgsize; + } +#endif + + memcpy(&fs->fs_old_postbl_start, ump->um_oldfscompat, 512); + extrasave = ump->um_oldfscompat; + extrasave += 512/sizeof(int32_t); + fs->fs_old_npsect = extrasave[0]; + fs->fs_old_interleave = extrasave[1]; + fs->fs_old_trackskew = extrasave[2]; + +} + +/* + * unmount vfs operation + */ +int +ffs_unmount(struct mount *mp, int mntflags) +{ + struct lwp *l = curlwp; + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + int error, flags; +#ifdef WAPBL + extern int doforce; +#endif + + flags = 0; + if (mntflags & MNT_FORCE) + flags |= FORCECLOSE; + if ((error = ffs_flushfiles(mp, flags, l)) != 0) + return (error); + error = UFS_WAPBL_BEGIN(mp); + if (error == 0) + if (fs->fs_ronly == 0 && + ffs_cgupdate(ump, MNT_WAIT) == 0 && + fs->fs_clean & FS_WASCLEAN) { + fs->fs_clean = FS_ISCLEAN; + fs->fs_fmod = 0; + (void) ffs_sbupdate(ump, MNT_WAIT); + } + if (error == 0) + UFS_WAPBL_END(mp); +#ifdef WAPBL + KASSERT(!(mp->mnt_wapbl_replay && mp->mnt_wapbl)); + if (mp->mnt_wapbl_replay) { + KDASSERT(fs->fs_ronly); + wapbl_replay_stop(mp->mnt_wapbl_replay); + wapbl_replay_free(mp->mnt_wapbl_replay); + mp->mnt_wapbl_replay = 0; + } + error = ffs_wapbl_stop(mp, doforce && (mntflags & MNT_FORCE)); + if (error) { + return error; + } +#endif /* WAPBL */ +#ifdef UFS_EXTATTR + if (ump->um_fstype == UFS1) { + ufs_extattr_stop(mp, l); + ufs_extattr_uepm_destroy(&ump->um_extattr); + } +#endif /* UFS_EXTATTR */ + + if (ump->um_devvp->v_type != VBAD) + ump->um_devvp->v_specmountpoint = NULL; + 
vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); + (void)VOP_CLOSE(ump->um_devvp, fs->fs_ronly ? FREAD : FREAD | FWRITE, + NOCRED); + vput(ump->um_devvp); + free(fs->fs_csp, M_UFSMNT); + free(fs, M_UFSMNT); + if (ump->um_oldfscompat != NULL) + free(ump->um_oldfscompat, M_UFSMNT); + mutex_destroy(&ump->um_lock); + ffs_snapshot_fini(ump); + free(ump, M_UFSMNT); + mp->mnt_data = NULL; + mp->mnt_flag &= ~MNT_LOCAL; + fstrans_unmount(mp); + return (0); +} + +/* + * Flush out all the files in a filesystem. + */ +int +ffs_flushfiles(struct mount *mp, int flags, struct lwp *l) +{ + extern int doforce; + struct ufsmount *ump; + int error; + + if (!doforce) + flags &= ~FORCECLOSE; + ump = VFSTOUFS(mp); +#ifdef QUOTA + if ((error = quota1_umount(mp, flags)) != 0) + return (error); +#endif +#ifdef QUOTA2 + if ((error = quota2_umount(mp, flags)) != 0) + return (error); +#endif + if ((error = vflush(mp, 0, SKIPSYSTEM | flags)) != 0) + return (error); + ffs_snapshot_unmount(mp); + /* + * Flush all the files. + */ + error = vflush(mp, NULLVP, flags); + if (error) + return (error); + /* + * Flush filesystem metadata. + */ + vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_FSYNC(ump->um_devvp, l->l_cred, FSYNC_WAIT, 0, 0); + VOP_UNLOCK(ump->um_devvp); + if (flags & FORCECLOSE) /* XXXDBJ */ + error = 0; + +#ifdef WAPBL + if (error) + return error; + if (mp->mnt_wapbl) { + error = wapbl_flush(mp->mnt_wapbl, 1); + if (flags & FORCECLOSE) + error = 0; + } +#endif + + return (error); +} + +/* + * Get file system statistics. + */ +int +ffs_statvfs(struct mount *mp, struct statvfs *sbp) +{ + struct ufsmount *ump; + struct fs *fs; + + ump = VFSTOUFS(mp); + fs = ump->um_fs; + mutex_enter(&ump->um_lock); + sbp->f_bsize = fs->fs_bsize; + sbp->f_frsize = fs->fs_fsize; + sbp->f_iosize = fs->fs_bsize; + sbp->f_blocks = fs->fs_dsize; + sbp->f_bfree = blkstofrags(fs, fs->fs_cstotal.cs_nbfree) + + fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks); + sbp->f_bresvd = ((u_int64_t) fs->fs_dsize * (u_int64_t) + fs->fs_minfree) / (u_int64_t) 100; + if (sbp->f_bfree > sbp->f_bresvd) + sbp->f_bavail = sbp->f_bfree - sbp->f_bresvd; + else + sbp->f_bavail = 0; + sbp->f_files = fs->fs_ncg * fs->fs_ipg - ROOTINO; + sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes; + sbp->f_favail = sbp->f_ffree; + sbp->f_fresvd = 0; + mutex_exit(&ump->um_lock); + copy_statvfs_info(sbp, mp); + + return (0); +} + +/* + * Go through the disk queues to initiate sandbagged IO; + * go through the inodes to write those that have been modified; + * initiate the writing of the super block if it has been modified. + * + * Note: we are always called with the filesystem marked `MPBUSY'. + */ +int +ffs_sync(struct mount *mp, int waitfor, kauth_cred_t cred) +{ + struct vnode *vp, *mvp, *nvp; + struct inode *ip; + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs; + int error, allerror = 0; + bool is_suspending; + + fs = ump->um_fs; + if (fs->fs_fmod != 0 && fs->fs_ronly != 0) { /* XXX */ + printf("fs = %s\n", fs->fs_fsmnt); + panic("update: rofs mod"); + } + + /* Allocate a marker vnode. */ + mvp = vnalloc(mp); + + fstrans_start(mp, FSTRANS_SHARED); + is_suspending = (fstrans_getstate(mp) == FSTRANS_SUSPENDING); + /* + * Write back each (modified) inode. 
+ */ + mutex_enter(&mntvnode_lock); +loop: + /* + * NOTE: not using the TAILQ_FOREACH here since in this loop vgone() + * and vclean() can be called indirectly + */ + for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) { + nvp = TAILQ_NEXT(vp, v_mntvnodes); + /* + * If the vnode that we are about to sync is no longer + * associated with this mount point, start over. + */ + if (vp->v_mount != mp) + goto loop; + /* + * Don't interfere with concurrent scans of this FS. + */ + if (vismarker(vp)) + continue; + mutex_enter(vp->v_interlock); + ip = VTOI(vp); + + /* + * Skip the vnode/inode if inaccessible. + */ + if (ip == NULL || (vp->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0 || + vp->v_type == VNON) { + mutex_exit(vp->v_interlock); + continue; + } + + /* + * We deliberately update inode times here. This will + * prevent a massive queue of updates accumulating, only + * to be handled by a call to unmount. + * + * XXX It would be better to have the syncer trickle these + * out. Adjustment needed to allow registering vnodes for + * sync when the vnode is clean, but the inode dirty. Or + * have ufs itself trickle out inode updates. + * + * If doing a lazy sync, we don't care about metadata or + * data updates, because they are handled by each vnode's + * synclist entry. In this case we are only interested in + * writing back modified inodes. + */ + if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE | + IN_MODIFY | IN_MODIFIED | IN_ACCESSED)) == 0 && + (waitfor == MNT_LAZY || (LIST_EMPTY(&vp->v_dirtyblkhd) && + UVM_OBJ_IS_CLEAN(&vp->v_uobj)))) { + mutex_exit(vp->v_interlock); + continue; + } + if (vp->v_type == VBLK && is_suspending) { + mutex_exit(vp->v_interlock); + continue; + } + vmark(mvp, vp); + mutex_exit(&mntvnode_lock); + error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT); + if (error) { + mutex_enter(&mntvnode_lock); + nvp = vunmark(mvp); + if (error == ENOENT) { + goto loop; + } + continue; + } + if (waitfor == MNT_LAZY) { + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (!error) { + error = ffs_update(vp, NULL, NULL, + UPDATE_CLOSE); + UFS_WAPBL_END(vp->v_mount); + } + } else { + error = VOP_FSYNC(vp, cred, FSYNC_NOLOG | + (waitfor == MNT_WAIT ? FSYNC_WAIT : 0), 0, 0); + } + if (error) + allerror = error; + vput(vp); + mutex_enter(&mntvnode_lock); + nvp = vunmark(mvp); + } + mutex_exit(&mntvnode_lock); + /* + * Force stale file system control information to be flushed. + */ + if (waitfor != MNT_LAZY && (ump->um_devvp->v_numoutput > 0 || + !LIST_EMPTY(&ump->um_devvp->v_dirtyblkhd))) { + vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); + if ((error = VOP_FSYNC(ump->um_devvp, cred, + (waitfor == MNT_WAIT ? FSYNC_WAIT : 0) | FSYNC_NOLOG, + 0, 0)) != 0) + allerror = error; + VOP_UNLOCK(ump->um_devvp); + if (allerror == 0 && waitfor == MNT_WAIT && !mp->mnt_wapbl) { + mutex_enter(&mntvnode_lock); + goto loop; + } + } +#if defined(QUOTA) || defined(QUOTA2) + qsync(mp); +#endif + /* + * Write back modified superblock. + */ + if (fs->fs_fmod != 0) { + fs->fs_fmod = 0; + fs->fs_time = time_second; + error = UFS_WAPBL_BEGIN(mp); + if (error) + allerror = error; + else { + if ((error = ffs_cgupdate(ump, waitfor))) + allerror = error; + UFS_WAPBL_END(mp); + } + } + +#ifdef WAPBL + if (mp->mnt_wapbl) { + error = wapbl_flush(mp->mnt_wapbl, 0); + if (error) + allerror = error; + } +#endif + + fstrans_done(mp); + vnfree(mvp); + return (allerror); +} + +/* + * Look up a FFS dinode number to find its incore vnode, otherwise read it + * in from disk. 
If it is in core, wait for the lock bit to clear, then + * return the inode locked. Detection and handling of mount points must be + * done by the calling routine. + */ +int +ffs_vget(struct mount *mp, ino_t ino, struct vnode **vpp) +{ + struct fs *fs; + struct inode *ip; + struct ufsmount *ump; + struct buf *bp; + struct vnode *vp; + dev_t dev; + int error; + + ump = VFSTOUFS(mp); + dev = ump->um_dev; + + retry: + if ((*vpp = ufs_ihashget(dev, ino, LK_EXCLUSIVE)) != NULL) + return (0); + + /* Allocate a new vnode/inode. */ + error = getnewvnode(VT_UFS, mp, ffs_vnodeop_p, NULL, &vp); + if (error) { + *vpp = NULL; + return (error); + } + ip = pool_cache_get(ffs_inode_cache, PR_WAITOK); + + /* + * If someone beat us to it, put back the freshly allocated + * vnode/inode pair and retry. + */ + mutex_enter(&ufs_hashlock); + if (ufs_ihashget(dev, ino, 0) != NULL) { + mutex_exit(&ufs_hashlock); + ungetnewvnode(vp); + pool_cache_put(ffs_inode_cache, ip); + goto retry; + } + + vp->v_vflag |= VV_LOCKSWORK; + + /* + * XXX MFS ends up here, too, to allocate an inode. Should we + * XXX create another pool for MFS inodes? + */ + + memset(ip, 0, sizeof(struct inode)); + vp->v_data = ip; + ip->i_vnode = vp; + ip->i_ump = ump; + ip->i_fs = fs = ump->um_fs; + ip->i_dev = dev; + ip->i_number = ino; +#if defined(QUOTA) || defined(QUOTA2) + ufsquota_init(ip); +#endif + + /* + * Initialize genfs node, we might proceed to destroy it in + * error branches. + */ + genfs_node_init(vp, &ffs_genfsops); + + /* + * Put it onto its hash chain and lock it so that other requests for + * this inode will block if they arrive while we are sleeping waiting + * for old data structures to be purged or for the contents of the + * disk portion of this inode to be read. + */ + + ufs_ihashins(ip); + mutex_exit(&ufs_hashlock); + + /* Read in the disk contents for the inode, copy into the inode. */ + error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), + (int)fs->fs_bsize, NOCRED, 0, &bp); + if (error) { + + /* + * The inode does not contain anything useful, so it would + * be misleading to leave it on its hash chain. With mode + * still zero, it will be unlinked and returned to the free + * list by vput(). + */ + + vput(vp); + brelse(bp, 0); + *vpp = NULL; + return (error); + } + if (ip->i_ump->um_fstype == UFS1) + ip->i_din.ffs1_din = pool_cache_get(ffs_dinode1_cache, + PR_WAITOK); + else + ip->i_din.ffs2_din = pool_cache_get(ffs_dinode2_cache, + PR_WAITOK); + ffs_load_inode(bp, ip, fs, ino); + brelse(bp, 0); + + /* + * Initialize the vnode from the inode, check for aliases. + * Note that the underlying vnode may have changed. + */ + + ufs_vinit(mp, ffs_specop_p, ffs_fifoop_p, &vp); + + /* + * Finish inode initialization now that aliasing has been resolved. + */ + + ip->i_devvp = ump->um_devvp; + vref(ip->i_devvp); + + /* + * Ensure that uid and gid are correct. This is a temporary + * fix until fsck has been changed to do the update. + */ + + if (fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */ + ip->i_uid = ip->i_ffs1_ouid; /* XXX */ + ip->i_gid = ip->i_ffs1_ogid; /* XXX */ + } /* XXX */ + uvm_vnp_setsize(vp, ip->i_size); + *vpp = vp; + return (0); +} + +/* + * File handle to vnode + * + * Have to be really careful about stale file handles: + * - check that the inode number is valid + * - call ffs_vget() to get the locked inode + * - check for an unallocated inode (i_mode == 0) + * - check that the given client host has export rights and return + * those rights via. 
exflagsp and credanonp + */ +int +ffs_fhtovp(struct mount *mp, struct fid *fhp, struct vnode **vpp) +{ + struct ufid ufh; + struct fs *fs; + + if (fhp->fid_len != sizeof(struct ufid)) + return EINVAL; + + memcpy(&ufh, fhp, sizeof(ufh)); + fs = VFSTOUFS(mp)->um_fs; + if (ufh.ufid_ino < ROOTINO || + ufh.ufid_ino >= fs->fs_ncg * fs->fs_ipg) + return (ESTALE); + return (ufs_fhtovp(mp, &ufh, vpp)); +} + +/* + * Vnode pointer to File handle + */ +/* ARGSUSED */ +int +ffs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size) +{ + struct inode *ip; + struct ufid ufh; + + if (*fh_size < sizeof(struct ufid)) { + *fh_size = sizeof(struct ufid); + return E2BIG; + } + ip = VTOI(vp); + *fh_size = sizeof(struct ufid); + memset(&ufh, 0, sizeof(ufh)); + ufh.ufid_len = sizeof(struct ufid); + ufh.ufid_ino = ip->i_number; + ufh.ufid_gen = ip->i_gen; + memcpy(fhp, &ufh, sizeof(ufh)); + return (0); +} + +void +ffs_init(void) +{ + if (ffs_initcount++ > 0) + return; + + ffs_inode_cache = pool_cache_init(sizeof(struct inode), 0, 0, 0, + "ffsino", NULL, IPL_NONE, NULL, NULL, NULL); + ffs_dinode1_cache = pool_cache_init(sizeof(struct ufs1_dinode), 0, 0, 0, + "ffsdino1", NULL, IPL_NONE, NULL, NULL, NULL); + ffs_dinode2_cache = pool_cache_init(sizeof(struct ufs2_dinode), 0, 0, 0, + "ffsdino2", NULL, IPL_NONE, NULL, NULL, NULL); + ufs_init(); +} + +void +ffs_reinit(void) +{ + + ufs_reinit(); +} + +void +ffs_done(void) +{ + if (--ffs_initcount > 0) + return; + + ufs_done(); + pool_cache_destroy(ffs_dinode2_cache); + pool_cache_destroy(ffs_dinode1_cache); + pool_cache_destroy(ffs_inode_cache); +} + +/* + * Write a superblock and associated information back to disk. + */ +int +ffs_sbupdate(struct ufsmount *mp, int waitfor) +{ + struct fs *fs = mp->um_fs; + struct buf *bp; + int error = 0; + u_int32_t saveflag; + + error = ffs_getblk(mp->um_devvp, + fs->fs_sblockloc / DEV_BSIZE, FFS_NOBLK, + fs->fs_sbsize, false, &bp); + if (error) + return error; + saveflag = fs->fs_flags & FS_INTERNAL; + fs->fs_flags &= ~FS_INTERNAL; + + memcpy(bp->b_data, fs, fs->fs_sbsize); + + ffs_oldfscompat_write((struct fs *)bp->b_data, mp); +#ifdef FFS_EI + if (mp->um_flags & UFS_NEEDSWAP) + ffs_sb_swap((struct fs *)bp->b_data, (struct fs *)bp->b_data); +#endif + fs->fs_flags |= saveflag; + + if (waitfor == MNT_WAIT) + error = bwrite(bp); + else + bawrite(bp); + return (error); +} + +int +ffs_cgupdate(struct ufsmount *mp, int waitfor) +{ + struct fs *fs = mp->um_fs; + struct buf *bp; + int blks; + void *space; + int i, size, error = 0, allerror = 0; + + allerror = ffs_sbupdate(mp, waitfor); + blks = howmany(fs->fs_cssize, fs->fs_fsize); + space = fs->fs_csp; + for (i = 0; i < blks; i += fs->fs_frag) { + size = fs->fs_bsize; + if (i + fs->fs_frag > blks) + size = (blks - i) * fs->fs_fsize; + error = ffs_getblk(mp->um_devvp, fsbtodb(fs, fs->fs_csaddr + i), + FFS_NOBLK, size, false, &bp); + if (error) + break; +#ifdef FFS_EI + if (mp->um_flags & UFS_NEEDSWAP) + ffs_csum_swap((struct csum*)space, + (struct csum*)bp->b_data, size); + else +#endif + memcpy(bp->b_data, space, (u_int)size); + space = (char *)space + size; + if (waitfor == MNT_WAIT) + error = bwrite(bp); + else + bawrite(bp); + } + if (!allerror && error) + allerror = error; + return (allerror); +} + +int +ffs_extattrctl(struct mount *mp, int cmd, struct vnode *vp, + int attrnamespace, const char *attrname) +{ +#ifdef UFS_EXTATTR + /* + * File-backed extended attributes are only supported on UFS1. + * UFS2 has native extended attributes. 
+ */ + if (VFSTOUFS(mp)->um_fstype == UFS1) + return (ufs_extattrctl(mp, cmd, vp, attrnamespace, attrname)); +#endif + return (vfs_stdextattrctl(mp, cmd, vp, attrnamespace, attrname)); +} + +int +ffs_suspendctl(struct mount *mp, int cmd) +{ + int error; + struct lwp *l = curlwp; + + switch (cmd) { + case SUSPEND_SUSPEND: + if ((error = fstrans_setstate(mp, FSTRANS_SUSPENDING)) != 0) + return error; + error = ffs_sync(mp, MNT_WAIT, l->l_proc->p_cred); + if (error == 0) + error = fstrans_setstate(mp, FSTRANS_SUSPENDED); +#ifdef WAPBL + if (error == 0 && mp->mnt_wapbl) + error = wapbl_flush(mp->mnt_wapbl, 1); +#endif + if (error != 0) { + (void) fstrans_setstate(mp, FSTRANS_NORMAL); + return error; + } + return 0; + + case SUSPEND_RESUME: + return fstrans_setstate(mp, FSTRANS_NORMAL); + + default: + return EINVAL; + } +} + +/* + * Synch vnode for a mounted file system. + */ +static int +ffs_vfs_fsync(vnode_t *vp, int flags) +{ + int error, i, pflags; +#ifdef WAPBL + struct mount *mp; +#endif + + KASSERT(vp->v_type == VBLK); + KASSERT(vp->v_specmountpoint != NULL); + + /* + * Flush all dirty data associated with the vnode. + */ + pflags = PGO_ALLPAGES | PGO_CLEANIT; + if ((flags & FSYNC_WAIT) != 0) + pflags |= PGO_SYNCIO; + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, 0, 0, pflags); + if (error) + return error; + +#ifdef WAPBL + mp = vp->v_specmountpoint; + if (mp && mp->mnt_wapbl) { + /* + * Don't bother writing out metadata if the syncer is + * making the request. We will let the sync vnode + * write it out in a single burst through a call to + * VFS_SYNC(). + */ + if ((flags & (FSYNC_DATAONLY | FSYNC_LAZY | FSYNC_NOLOG)) != 0) + return 0; + + /* + * Don't flush the log if the vnode being flushed + * contains no dirty buffers that could be in the log. + */ + if (!LIST_EMPTY(&vp->v_dirtyblkhd)) { + error = wapbl_flush(mp->mnt_wapbl, 0); + if (error) + return error; + } + + if ((flags & FSYNC_WAIT) != 0) { + mutex_enter(vp->v_interlock); + while (vp->v_numoutput) + cv_wait(&vp->v_cv, vp->v_interlock); + mutex_exit(vp->v_interlock); + } + + return 0; + } +#endif /* WAPBL */ + + error = vflushbuf(vp, (flags & FSYNC_WAIT) != 0); + if (error == 0 && (flags & FSYNC_CACHE) != 0) { + i = 1; + (void)VOP_IOCTL(vp, DIOCCACHESYNC, &i, FWRITE, + kauth_cred_get()); + } + + return error; +} diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c new file mode 100644 index 000000000..9acc0bdce --- /dev/null +++ b/sys/ufs/ffs/ffs_vnops.c @@ -0,0 +1,785 @@ +/* $NetBSD: ffs_vnops.c,v 1.120 2011/06/27 16:34:47 manu Exp $ */ + +/*- + * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc, and by Andrew Doran. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ffs_vnops.c,v 1.120 2011/06/27 16:34:47 manu Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_ffs.h" +#include "opt_wapbl.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include + +/* Global vfs data structures for ufs. 
*/ +int (**ffs_vnodeop_p)(void *); +const struct vnodeopv_entry_desc ffs_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, ufs_lookup }, /* lookup */ + { &vop_create_desc, ufs_create }, /* create */ + { &vop_whiteout_desc, ufs_whiteout }, /* whiteout */ + { &vop_mknod_desc, ufs_mknod }, /* mknod */ + { &vop_open_desc, ufs_open }, /* open */ + { &vop_close_desc, ufs_close }, /* close */ + { &vop_access_desc, ufs_access }, /* access */ + { &vop_getattr_desc, ufs_getattr }, /* getattr */ + { &vop_setattr_desc, ufs_setattr }, /* setattr */ + { &vop_read_desc, ffs_read }, /* read */ + { &vop_write_desc, ffs_write }, /* write */ + { &vop_ioctl_desc, ufs_ioctl }, /* ioctl */ + { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */ + { &vop_poll_desc, ufs_poll }, /* poll */ + { &vop_kqfilter_desc, genfs_kqfilter }, /* kqfilter */ + { &vop_revoke_desc, ufs_revoke }, /* revoke */ + { &vop_mmap_desc, ufs_mmap }, /* mmap */ + { &vop_fsync_desc, ffs_fsync }, /* fsync */ + { &vop_seek_desc, ufs_seek }, /* seek */ + { &vop_remove_desc, ufs_remove }, /* remove */ + { &vop_link_desc, ufs_link }, /* link */ + { &vop_rename_desc, ufs_rename }, /* rename */ + { &vop_mkdir_desc, ufs_mkdir }, /* mkdir */ + { &vop_rmdir_desc, ufs_rmdir }, /* rmdir */ + { &vop_symlink_desc, ufs_symlink }, /* symlink */ + { &vop_readdir_desc, ufs_readdir }, /* readdir */ + { &vop_readlink_desc, ufs_readlink }, /* readlink */ + { &vop_abortop_desc, ufs_abortop }, /* abortop */ + { &vop_inactive_desc, ufs_inactive }, /* inactive */ + { &vop_reclaim_desc, ffs_reclaim }, /* reclaim */ + { &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, ufs_bmap }, /* bmap */ + { &vop_strategy_desc, ufs_strategy }, /* strategy */ + { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, ufs_pathconf }, /* pathconf */ + { &vop_advlock_desc, ufs_advlock }, /* advlock */ + { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ + { &vop_getpages_desc, genfs_getpages }, /* getpages */ + { &vop_putpages_desc, genfs_putpages }, /* putpages */ + { &vop_openextattr_desc, ffs_openextattr }, /* openextattr */ + { &vop_closeextattr_desc, ffs_closeextattr }, /* closeextattr */ + { &vop_getextattr_desc, ffs_getextattr }, /* getextattr */ + { &vop_setextattr_desc, ffs_setextattr }, /* setextattr */ + { &vop_listextattr_desc, ffs_listextattr }, /* listextattr */ + { &vop_deleteextattr_desc, ffs_deleteextattr }, /* deleteextattr */ + { NULL, NULL } +}; +const struct vnodeopv_desc ffs_vnodeop_opv_desc = + { &ffs_vnodeop_p, ffs_vnodeop_entries }; + +int (**ffs_specop_p)(void *); +const struct vnodeopv_entry_desc ffs_specop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, spec_lookup }, /* lookup */ + { &vop_create_desc, spec_create }, /* create */ + { &vop_mknod_desc, spec_mknod }, /* mknod */ + { &vop_open_desc, spec_open }, /* open */ + { &vop_close_desc, ufsspec_close }, /* close */ + { &vop_access_desc, ufs_access }, /* access */ + { &vop_getattr_desc, ufs_getattr }, /* getattr */ + { &vop_setattr_desc, ufs_setattr }, /* setattr */ + { &vop_read_desc, ufsspec_read }, /* read */ + { &vop_write_desc, ufsspec_write }, /* write */ + { &vop_ioctl_desc, spec_ioctl }, /* ioctl */ + { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */ + { &vop_poll_desc, spec_poll }, /* poll */ + { &vop_kqfilter_desc, spec_kqfilter }, /* kqfilter */ + { &vop_revoke_desc, spec_revoke }, /* revoke */ + { &vop_mmap_desc, spec_mmap 
}, /* mmap */ + { &vop_fsync_desc, ffs_spec_fsync }, /* fsync */ + { &vop_seek_desc, spec_seek }, /* seek */ + { &vop_remove_desc, spec_remove }, /* remove */ + { &vop_link_desc, spec_link }, /* link */ + { &vop_rename_desc, spec_rename }, /* rename */ + { &vop_mkdir_desc, spec_mkdir }, /* mkdir */ + { &vop_rmdir_desc, spec_rmdir }, /* rmdir */ + { &vop_symlink_desc, spec_symlink }, /* symlink */ + { &vop_readdir_desc, spec_readdir }, /* readdir */ + { &vop_readlink_desc, spec_readlink }, /* readlink */ + { &vop_abortop_desc, spec_abortop }, /* abortop */ + { &vop_inactive_desc, ufs_inactive }, /* inactive */ + { &vop_reclaim_desc, ffs_reclaim }, /* reclaim */ + { &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, spec_bmap }, /* bmap */ + { &vop_strategy_desc, spec_strategy }, /* strategy */ + { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, spec_pathconf }, /* pathconf */ + { &vop_advlock_desc, spec_advlock }, /* advlock */ + { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ + { &vop_getpages_desc, spec_getpages }, /* getpages */ + { &vop_putpages_desc, spec_putpages }, /* putpages */ + { &vop_openextattr_desc, ffs_openextattr }, /* openextattr */ + { &vop_closeextattr_desc, ffs_closeextattr }, /* closeextattr */ + { &vop_getextattr_desc, ffs_getextattr }, /* getextattr */ + { &vop_setextattr_desc, ffs_setextattr }, /* setextattr */ + { &vop_listextattr_desc, ffs_listextattr }, /* listextattr */ + { &vop_deleteextattr_desc, ffs_deleteextattr }, /* deleteextattr */ + { NULL, NULL } +}; +const struct vnodeopv_desc ffs_specop_opv_desc = + { &ffs_specop_p, ffs_specop_entries }; + +int (**ffs_fifoop_p)(void *); +const struct vnodeopv_entry_desc ffs_fifoop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, vn_fifo_bypass }, /* lookup */ + { &vop_create_desc, vn_fifo_bypass }, /* create */ + { &vop_mknod_desc, vn_fifo_bypass }, /* mknod */ + { &vop_open_desc, vn_fifo_bypass }, /* open */ + { &vop_close_desc, ufsfifo_close }, /* close */ + { &vop_access_desc, ufs_access }, /* access */ + { &vop_getattr_desc, ufs_getattr }, /* getattr */ + { &vop_setattr_desc, ufs_setattr }, /* setattr */ + { &vop_read_desc, ufsfifo_read }, /* read */ + { &vop_write_desc, ufsfifo_write }, /* write */ + { &vop_ioctl_desc, vn_fifo_bypass }, /* ioctl */ + { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */ + { &vop_poll_desc, vn_fifo_bypass }, /* poll */ + { &vop_kqfilter_desc, vn_fifo_bypass }, /* kqfilter */ + { &vop_revoke_desc, vn_fifo_bypass }, /* revoke */ + { &vop_mmap_desc, vn_fifo_bypass }, /* mmap */ + { &vop_fsync_desc, ffs_fsync }, /* fsync */ + { &vop_seek_desc, vn_fifo_bypass }, /* seek */ + { &vop_remove_desc, vn_fifo_bypass }, /* remove */ + { &vop_link_desc, vn_fifo_bypass }, /* link */ + { &vop_rename_desc, vn_fifo_bypass }, /* rename */ + { &vop_mkdir_desc, vn_fifo_bypass }, /* mkdir */ + { &vop_rmdir_desc, vn_fifo_bypass }, /* rmdir */ + { &vop_symlink_desc, vn_fifo_bypass }, /* symlink */ + { &vop_readdir_desc, vn_fifo_bypass }, /* readdir */ + { &vop_readlink_desc, vn_fifo_bypass }, /* readlink */ + { &vop_abortop_desc, vn_fifo_bypass }, /* abortop */ + { &vop_inactive_desc, ufs_inactive }, /* inactive */ + { &vop_reclaim_desc, ffs_reclaim }, /* reclaim */ + { &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, vn_fifo_bypass }, /* bmap */ + { &vop_strategy_desc, vn_fifo_bypass }, /* strategy */ 
+ { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, vn_fifo_bypass }, /* pathconf */ + { &vop_advlock_desc, vn_fifo_bypass }, /* advlock */ + { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ + { &vop_putpages_desc, vn_fifo_bypass }, /* putpages */ + { &vop_openextattr_desc, ffs_openextattr }, /* openextattr */ + { &vop_closeextattr_desc, ffs_closeextattr }, /* closeextattr */ + { &vop_getextattr_desc, ffs_getextattr }, /* getextattr */ + { &vop_setextattr_desc, ffs_setextattr }, /* setextattr */ + { &vop_listextattr_desc, ffs_listextattr }, /* listextattr */ + { &vop_deleteextattr_desc, ffs_deleteextattr }, /* deleteextattr */ + { NULL, NULL } +}; +const struct vnodeopv_desc ffs_fifoop_opv_desc = + { &ffs_fifoop_p, ffs_fifoop_entries }; + +#include + +int +ffs_spec_fsync(void *v) +{ + struct vop_fsync_args /* { + struct vnode *a_vp; + kauth_cred_t a_cred; + int a_flags; + off_t a_offlo; + off_t a_offhi; + struct lwp *a_l; + } */ *ap = v; + int error, flags, uflags; + struct vnode *vp; + struct mount *mp; + + flags = ap->a_flags; + uflags = UPDATE_CLOSE | ((flags & FSYNC_WAIT) ? UPDATE_WAIT : 0); + vp = ap->a_vp; + mp = vp->v_mount; + + fstrans_start(mp, FSTRANS_LAZY); + + error = spec_fsync(v); + if (error) + goto out; + +#ifdef WAPBL + if (mp && mp->mnt_wapbl) { + /* + * Don't bother writing out metadata if the syncer is + * making the request. We will let the sync vnode + * write it out in a single burst through a call to + * VFS_SYNC(). + */ + if ((flags & (FSYNC_DATAONLY | FSYNC_LAZY)) != 0) + goto out; + if ((VTOI(vp)->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE + | IN_MODIFY | IN_MODIFIED | IN_ACCESSED)) != 0) { + error = UFS_WAPBL_BEGIN(mp); + if (error != 0) + goto out; + error = ffs_update(vp, NULL, NULL, uflags); + UFS_WAPBL_END(mp); + } + goto out; + } +#endif /* WAPBL */ + + error = ffs_update(vp, NULL, NULL, uflags); + +out: + fstrans_done(mp); + return error; +} + +int +ffs_fsync(void *v) +{ + struct vop_fsync_args /* { + struct vnode *a_vp; + kauth_cred_t a_cred; + int a_flags; + off_t a_offlo; + off_t a_offhi; + struct lwp *a_l; + } */ *ap = v; + struct buf *bp; + int num, error, i; + struct indir ia[NIADDR + 1]; + int bsize; + daddr_t blk_high; + struct vnode *vp; + struct mount *mp; + + vp = ap->a_vp; + mp = vp->v_mount; + + fstrans_start(mp, FSTRANS_LAZY); + if ((ap->a_offlo == 0 && ap->a_offhi == 0) || (vp->v_type != VREG)) { + error = ffs_full_fsync(vp, ap->a_flags); + goto out; + } + + bsize = mp->mnt_stat.f_iosize; + blk_high = ap->a_offhi / bsize; + if (ap->a_offhi % bsize != 0) + blk_high++; + + /* + * First, flush all pages in range. + */ + + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, trunc_page(ap->a_offlo), + round_page(ap->a_offhi), PGO_CLEANIT | + ((ap->a_flags & FSYNC_WAIT) ? PGO_SYNCIO : 0)); + if (error) { + goto out; + } + +#ifdef WAPBL + KASSERT(vp->v_type == VREG); + if (mp->mnt_wapbl) { + /* + * Don't bother writing out metadata if the syncer is + * making the request. We will let the sync vnode + * write it out in a single burst through a call to + * VFS_SYNC(). + */ + if ((ap->a_flags & (FSYNC_DATAONLY | FSYNC_LAZY)) != 0) { + fstrans_done(mp); + return 0; + } + error = 0; + if (vp->v_tag == VT_UFS && VTOI(vp)->i_flag & + (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY | + IN_MODIFIED | IN_ACCESSED)) { + error = UFS_WAPBL_BEGIN(mp); + if (error) { + fstrans_done(mp); + return error; + } + error = ffs_update(vp, NULL, NULL, UPDATE_CLOSE | + ((ap->a_flags & FSYNC_WAIT) ? 
UPDATE_WAIT : 0)); + UFS_WAPBL_END(mp); + } + if (error || (ap->a_flags & FSYNC_NOLOG) != 0) { + fstrans_done(mp); + return error; + } + error = wapbl_flush(mp->mnt_wapbl, 0); + fstrans_done(mp); + return error; + } +#endif /* WAPBL */ + + /* + * Then, flush indirect blocks. + */ + + if (blk_high >= NDADDR) { + error = ufs_getlbns(vp, blk_high, ia, &num); + if (error) + goto out; + + mutex_enter(&bufcache_lock); + for (i = 0; i < num; i++) { + if ((bp = incore(vp, ia[i].in_lbn)) == NULL) + continue; + if ((bp->b_cflags & BC_BUSY) != 0 || + (bp->b_oflags & BO_DELWRI) == 0) + continue; + bp->b_cflags |= BC_BUSY | BC_VFLUSH; + mutex_exit(&bufcache_lock); + bawrite(bp); + mutex_enter(&bufcache_lock); + } + mutex_exit(&bufcache_lock); + } + + if (ap->a_flags & FSYNC_WAIT) { + mutex_enter(vp->v_interlock); + while (vp->v_numoutput > 0) + cv_wait(&vp->v_cv, vp->v_interlock); + mutex_exit(vp->v_interlock); + } + + error = ffs_update(vp, NULL, NULL, UPDATE_CLOSE | + (((ap->a_flags & (FSYNC_WAIT | FSYNC_DATAONLY)) == FSYNC_WAIT) + ? UPDATE_WAIT : 0)); + + if (error == 0 && ap->a_flags & FSYNC_CACHE) { + int l = 0; + VOP_IOCTL(VTOI(vp)->i_devvp, DIOCCACHESYNC, &l, FWRITE, + curlwp->l_cred); + } + +out: + fstrans_done(mp); + return error; +} + +/* + * Synch an open file. Called for VOP_FSYNC(). + */ +/* ARGSUSED */ +int +ffs_full_fsync(struct vnode *vp, int flags) +{ + int error, i, uflags; + struct mount *mp; + + KASSERT(vp->v_tag == VT_UFS); + KASSERT(VTOI(vp) != NULL); + KASSERT(vp->v_type != VCHR && vp->v_type != VBLK); + + error = 0; + uflags = UPDATE_CLOSE | ((flags & FSYNC_WAIT) ? UPDATE_WAIT : 0); + + mp = vp->v_mount; + + /* + * Flush all dirty data associated with the vnode. + */ + if (vp->v_type == VREG) { + int pflags = PGO_ALLPAGES | PGO_CLEANIT; + + if ((flags & FSYNC_WAIT)) + pflags |= PGO_SYNCIO; + if (fstrans_getstate(mp) == FSTRANS_SUSPENDING) + pflags |= PGO_FREE; + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, 0, 0, pflags); + if (error) + return error; + } + +#ifdef WAPBL + if (mp && mp->mnt_wapbl) { + /* + * Don't bother writing out metadata if the syncer is + * making the request. We will let the sync vnode + * write it out in a single burst through a call to + * VFS_SYNC(). + */ + if ((flags & (FSYNC_DATAONLY | FSYNC_LAZY)) != 0) + return 0; + + if ((VTOI(vp)->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE + | IN_MODIFY | IN_MODIFIED | IN_ACCESSED)) != 0) { + error = UFS_WAPBL_BEGIN(mp); + if (error) + return error; + error = ffs_update(vp, NULL, NULL, uflags); + UFS_WAPBL_END(mp); + } + if (error || (flags & FSYNC_NOLOG) != 0) + return error; + + /* + * Don't flush the log if the vnode being flushed + * contains no dirty buffers that could be in the log. + */ + if (!LIST_EMPTY(&vp->v_dirtyblkhd)) { + error = wapbl_flush(mp->mnt_wapbl, 0); + if (error) + return error; + } + + if ((flags & FSYNC_WAIT) != 0) { + mutex_enter(vp->v_interlock); + while (vp->v_numoutput != 0) + cv_wait(&vp->v_cv, vp->v_interlock); + mutex_exit(vp->v_interlock); + } + + return error; + } +#endif /* WAPBL */ + + error = vflushbuf(vp, (flags & FSYNC_WAIT) != 0); + if (error == 0) + error = ffs_update(vp, NULL, NULL, uflags); + if (error == 0 && (flags & FSYNC_CACHE) != 0) { + i = 1; + (void)VOP_IOCTL(VTOI(vp)->i_devvp, DIOCCACHESYNC, &i, FWRITE, + kauth_cred_get()); + } + + return error; +} + +/* + * Reclaim an inode so that it can be used for other purposes. 
+ */ +int +ffs_reclaim(void *v) +{ + struct vop_reclaim_args /* { + struct vnode *a_vp; + struct lwp *a_l; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct mount *mp = vp->v_mount; + struct ufsmount *ump = ip->i_ump; + void *data; + int error; + + fstrans_start(mp, FSTRANS_LAZY); + /* + * The inode must be freed and updated before being removed + * from its hash chain. Other threads trying to gain a hold + * on the inode will be stalled because it is locked (VI_XLOCK). + */ + error = UFS_WAPBL_BEGIN(mp); + if (error) { + fstrans_done(mp); + return error; + } + if (ip->i_nlink <= 0 && ip->i_omode != 0 && + (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) + ffs_vfree(vp, ip->i_number, ip->i_omode); + UFS_WAPBL_END(mp); + if ((error = ufs_reclaim(vp)) != 0) { + fstrans_done(mp); + return (error); + } + if (ip->i_din.ffs1_din != NULL) { + if (ump->um_fstype == UFS1) + pool_cache_put(ffs_dinode1_cache, ip->i_din.ffs1_din); + else + pool_cache_put(ffs_dinode2_cache, ip->i_din.ffs2_din); + } + /* + * To interlock with ffs_sync(). + */ + genfs_node_destroy(vp); + mutex_enter(vp->v_interlock); + data = vp->v_data; + vp->v_data = NULL; + mutex_exit(vp->v_interlock); + + /* + * XXX MFS ends up here, too, to free an inode. Should we create + * XXX a separate pool for MFS inodes? + */ + pool_cache_put(ffs_inode_cache, data); + fstrans_done(mp); + return (0); +} + +/* + * Return the last logical file offset that should be written for this file + * if we're doing a write that ends at "size". + */ + +void +ffs_gop_size(struct vnode *vp, off_t size, off_t *eobp, int flags) +{ + struct inode *ip = VTOI(vp); + struct fs *fs = ip->i_fs; + daddr_t olbn, nlbn; + + olbn = lblkno(fs, ip->i_size); + nlbn = lblkno(fs, size); + if (nlbn < NDADDR && olbn <= nlbn) { + *eobp = fragroundup(fs, size); + } else { + *eobp = blkroundup(fs, size); + } +} + +int +ffs_openextattr(void *v) +{ + struct vop_openextattr_args /* { + struct vnode *a_vp; + kauth_cred_t a_cred; + struct proc *a_p; + } */ *ap = v; + struct inode *ip = VTOI(ap->a_vp); + struct fs *fs = ip->i_fs; + + /* Not supported for UFS1 file systems. */ + if (fs->fs_magic == FS_UFS1_MAGIC) + return (EOPNOTSUPP); + + /* XXX Not implemented for UFS2 file systems. */ + return (EOPNOTSUPP); +} + +int +ffs_closeextattr(void *v) +{ + struct vop_closeextattr_args /* { + struct vnode *a_vp; + int a_commit; + kauth_cred_t a_cred; + struct proc *a_p; + } */ *ap = v; + struct inode *ip = VTOI(ap->a_vp); + struct fs *fs = ip->i_fs; + + /* Not supported for UFS1 file systems. */ + if (fs->fs_magic == FS_UFS1_MAGIC) + return (EOPNOTSUPP); + + /* XXX Not implemented for UFS2 file systems. */ + return (EOPNOTSUPP); +} + +int +ffs_getextattr(void *v) +{ + struct vop_getextattr_args /* { + struct vnode *a_vp; + int a_attrnamespace; + const char *a_name; + struct uio *a_uio; + size_t *a_size; + kauth_cred_t a_cred; + struct proc *a_p; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct fs *fs = ip->i_fs; + + if (fs->fs_magic == FS_UFS1_MAGIC) { +#ifdef UFS_EXTATTR + int error; + + fstrans_start(vp->v_mount, FSTRANS_SHARED); + error = ufs_getextattr(ap); + fstrans_done(vp->v_mount); + return error; +#else + return (EOPNOTSUPP); +#endif + } + + /* XXX Not implemented for UFS2 file systems. 
*/ + return (EOPNOTSUPP); +} + +int +ffs_setextattr(void *v) +{ + struct vop_setextattr_args /* { + struct vnode *a_vp; + int a_attrnamespace; + const char *a_name; + struct uio *a_uio; + kauth_cred_t a_cred; + struct proc *a_p; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct fs *fs = ip->i_fs; + + if (fs->fs_magic == FS_UFS1_MAGIC) { +#ifdef UFS_EXTATTR + int error; + + fstrans_start(vp->v_mount, FSTRANS_SHARED); + error = ufs_setextattr(ap); + fstrans_done(vp->v_mount); + return error; +#else + return (EOPNOTSUPP); +#endif + } + + /* XXX Not implemented for UFS2 file systems. */ + return (EOPNOTSUPP); +} + +int +ffs_listextattr(void *v) +{ + struct vop_listextattr_args /* { + struct vnode *a_vp; + int a_attrnamespace; + struct uio *a_uio; + size_t *a_size; + kauth_cred_t a_cred; + struct proc *a_p; + } */ *ap = v; + struct inode *ip = VTOI(ap->a_vp); + struct fs *fs = ip->i_fs; + + if (fs->fs_magic == FS_UFS1_MAGIC) { +#ifdef UFS_EXTATTR + struct vnode *vp = ap->a_vp; + int error; + + fstrans_start(vp->v_mount, FSTRANS_SHARED); + error = ufs_listextattr(ap); + fstrans_done(vp->v_mount); + return error; +#else + return (EOPNOTSUPP); +#endif + } + + /* XXX Not implemented for UFS2 file systems. */ + return (EOPNOTSUPP); +} + +int +ffs_deleteextattr(void *v) +{ + struct vop_deleteextattr_args /* { + struct vnode *a_vp; + int a_attrnamespace; + kauth_cred_t a_cred; + struct proc *a_p; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct fs *fs = ip->i_fs; + + if (fs->fs_magic == FS_UFS1_MAGIC) { +#ifdef UFS_EXTATTR + int error; + + fstrans_start(vp->v_mount, FSTRANS_SHARED); + error = ufs_deleteextattr(ap); + fstrans_done(vp->v_mount); + return error; +#else + return (EOPNOTSUPP); +#endif + } + + /* XXX Not implemented for UFS2 file systems. */ + return (EOPNOTSUPP); +} diff --git a/sys/ufs/ffs/ffs_wapbl.c b/sys/ufs/ffs/ffs_wapbl.c new file mode 100644 index 000000000..aa6b2dae1 --- /dev/null +++ b/sys/ufs/ffs/ffs_wapbl.c @@ -0,0 +1,883 @@ +/* $NetBSD: ffs_wapbl.c,v 1.17 2010/12/24 13:38:57 mlelstv Exp $ */ + +/*- + * Copyright (c) 2003,2006,2008 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include
+__KERNEL_RCSID(0, "$NetBSD: ffs_wapbl.c,v 1.17 2010/12/24 13:38:57 mlelstv Exp $");
+
+#define WAPBL_INTERNAL
+
+#if defined(_KERNEL_OPT)
+#include "opt_ffs.h"
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#undef WAPBL_DEBUG
+#ifdef WAPBL_DEBUG
+int ffs_wapbl_debug = 1;
+#define DPRINTF(fmt, args...) \
+do { \
+ if (ffs_wapbl_debug) \
+ printf("%s:%d "fmt, __func__ , __LINE__, ##args); \
+} while (/* CONSTCOND */0)
+#else
+#define DPRINTF(fmt, args...) \
+do { \
+ /* nothing */ \
+} while (/* CONSTCOND */0)
+#endif
+
+static int ffs_superblock_layout(struct fs *);
+static int wapbl_log_position(struct mount *, struct fs *, struct vnode *,
+ daddr_t *, size_t *, size_t *, uint64_t *);
+static int wapbl_create_infs_log(struct mount *, struct fs *, struct vnode *,
+ daddr_t *, size_t *, uint64_t *);
+static void wapbl_find_log_start(struct mount *, struct vnode *, off_t,
+ daddr_t *, daddr_t *, size_t *);
+static int wapbl_remove_log(struct mount *);
+static int wapbl_allocate_log_file(struct mount *, struct vnode *,
+ daddr_t *, size_t *, uint64_t *);
+
+/*
+ * Return the super block layout format - UFS1 or UFS2.
+ * WAPBL only works with UFS2 layout (which is still available
+ * with FFSv1).
+ *
+ * XXX Should this be in ufs/ffs/fs.h? Same style of check is
+ * also used in ffs_alloc.c in a few places.
+ */
+static int
+ffs_superblock_layout(struct fs *fs)
+{
+ if ((fs->fs_magic == FS_UFS1_MAGIC) &&
+ ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0))
+ return 1;
+ else
+ return 2;
+}
+
+/*
+ * This function is invoked after a log is replayed to
+ * disk to perform logical cleanup actions as described by
+ * the log
+ */
+void
+ffs_wapbl_replay_finish(struct mount *mp)
+{
+ struct wapbl_replay *wr = mp->mnt_wapbl_replay;
+ int i;
+ int error;
+
+ if (!wr)
+ return;
+
+ KDASSERT((mp->mnt_flag & MNT_RDONLY) == 0);
+
+ for (i = 0; i < wr->wr_inodescnt; i++) {
+ struct vnode *vp;
+ struct inode *ip;
+ error = VFS_VGET(mp, wr->wr_inodes[i].wr_inumber, &vp);
+ if (error) {
+ printf("ffs_wapbl_replay_finish: "
+ "unable to cleanup inode %" PRIu32 "\n",
+ wr->wr_inodes[i].wr_inumber);
+ continue;
+ }
+ ip = VTOI(vp);
+ KDASSERT(wr->wr_inodes[i].wr_inumber == ip->i_number);
+#ifdef WAPBL_DEBUG
+ printf("ffs_wapbl_replay_finish: "
+ "cleaning inode %" PRIu64 " size=%" PRIu64 " mode=%o nlink=%d\n",
+ ip->i_number, ip->i_size, ip->i_mode, ip->i_nlink);
+#endif
+ KASSERT(ip->i_nlink == 0);
+
+ /*
+ * The journal may have left partially allocated inodes in mode
+ * zero. This may occur if a crash occurs between the node
+ * allocation in ffs_nodeallocg and when the node is properly
+ * initialized in ufs_makeinode. If so, just deallocate them.
+ */ + if (ip->i_mode == 0) { + UFS_WAPBL_BEGIN(mp); + ffs_vfree(vp, ip->i_number, wr->wr_inodes[i].wr_imode); + UFS_WAPBL_END(mp); + } + vput(vp); + } + wapbl_replay_stop(wr); + wapbl_replay_free(wr); + mp->mnt_wapbl_replay = NULL; +} + +/* Callback for wapbl */ +void +ffs_wapbl_sync_metadata(struct mount *mp, daddr_t *deallocblks, + int *dealloclens, int dealloccnt) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + int i, error; + +#ifdef WAPBL_DEBUG_INODES + ufs_wapbl_verify_inodes(mp, "ffs_wapbl_sync_metadata"); +#endif + + for (i = 0; i< dealloccnt; i++) { + /* + * blkfree errors are unreported, might silently fail + * if it cannot read the cylinder group block + */ + ffs_blkfree(fs, ump->um_devvp, + dbtofsb(fs, deallocblks[i]), dealloclens[i], -1); + } + + fs->fs_fmod = 0; + fs->fs_time = time_second; + error = ffs_cgupdate(ump, 0); + KASSERT(error == 0); +} + +void +ffs_wapbl_abort_sync_metadata(struct mount *mp, daddr_t *deallocblks, + int *dealloclens, int dealloccnt) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + int i; + + for (i = 0; i < dealloccnt; i++) { + /* + * Since the above blkfree may have failed, this blkalloc might + * fail as well, so don't check its error. Note that if the + * blkfree succeeded above, then this shouldn't fail because + * the buffer will be locked in the current transaction. + */ + ffs_blkalloc_ump(ump, dbtofsb(fs, deallocblks[i]), + dealloclens[i]); + } +} + +static int +wapbl_remove_log(struct mount *mp) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + struct vnode *vp; + struct inode *ip; + ino_t log_ino; + int error; + + /* If super block layout is too old to support WAPBL, return */ + if (ffs_superblock_layout(fs) < 2) + return 0; + + /* If all the log locators are 0, just clean up */ + if (fs->fs_journallocs[0] == 0 && + fs->fs_journallocs[1] == 0 && + fs->fs_journallocs[2] == 0 && + fs->fs_journallocs[3] == 0) { + DPRINTF("empty locators, just clear\n"); + goto done; + } + + switch (fs->fs_journal_location) { + case UFS_WAPBL_JOURNALLOC_NONE: + /* nothing! */ + DPRINTF("no log\n"); + break; + + case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM: + log_ino = fs->fs_journallocs[UFS_WAPBL_INFS_INO]; + DPRINTF("in-fs log, ino = %" PRId64 "\n",log_ino); + + /* if no existing log inode, just clear all fields and bail */ + if (log_ino == 0) + goto done; + error = VFS_VGET(mp, log_ino, &vp); + if (error != 0) { + printf("ffs_wapbl: vget failed %d\n", + error); + /* clear out log info on error */ + goto done; + } + ip = VTOI(vp); + KASSERT(log_ino == ip->i_number); + if ((ip->i_flags & SF_LOG) == 0) { + printf("ffs_wapbl: try to clear non-log inode " + "%" PRId64 "\n", log_ino); + vput(vp); + /* clear out log info on error */ + goto done; + } + + /* + * remove the log inode by setting its link count back + * to zero and bail. 
+ */ + ip->i_nlink = 0; + DIP_ASSIGN(ip, nlink, 0); + vput(vp); + + case UFS_WAPBL_JOURNALLOC_END_PARTITION: + DPRINTF("end-of-partition log\n"); + /* no extra work required */ + break; + + default: + printf("ffs_wapbl: unknown journal type %d\n", + fs->fs_journal_location); + break; + } + + +done: + /* Clear out all previous knowledge of journal */ + fs->fs_journal_version = 0; + fs->fs_journal_location = 0; + fs->fs_journal_flags = 0; + fs->fs_journallocs[0] = 0; + fs->fs_journallocs[1] = 0; + fs->fs_journallocs[2] = 0; + fs->fs_journallocs[3] = 0; + (void) ffs_sbupdate(ump, MNT_WAIT); + + return 0; +} + +int +ffs_wapbl_start(struct mount *mp) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + struct vnode *devvp = ump->um_devvp; + daddr_t off; + size_t count; + size_t blksize; + uint64_t extradata; + int error; + + if (mp->mnt_wapbl == NULL) { + if (fs->fs_journal_flags & UFS_WAPBL_FLAGS_CLEAR_LOG) { + /* Clear out any existing journal file */ + error = wapbl_remove_log(mp); + if (error != 0) + return error; + } + + if (mp->mnt_flag & MNT_LOG) { + KDASSERT(fs->fs_ronly == 0); + + /* WAPBL needs UFS2 format super block */ + if (ffs_superblock_layout(fs) < 2) { + printf("%s fs superblock in old format, " + "not journaling\n", + VFSTOUFS(mp)->um_fs->fs_fsmnt); + mp->mnt_flag &= ~MNT_LOG; + return EINVAL; + } + + error = wapbl_log_position(mp, fs, devvp, &off, + &count, &blksize, &extradata); + if (error) + return error; + + error = wapbl_start(&mp->mnt_wapbl, mp, devvp, off, + count, blksize, mp->mnt_wapbl_replay, + ffs_wapbl_sync_metadata, + ffs_wapbl_abort_sync_metadata); + if (error) + return error; + + mp->mnt_wapbl_op = &wapbl_ops; + +#ifdef WAPBL_DEBUG + printf("%s: enabling logging\n", fs->fs_fsmnt); +#endif + + if ((fs->fs_flags & FS_DOWAPBL) == 0) { + UFS_WAPBL_BEGIN(mp); + fs->fs_flags |= FS_DOWAPBL; + error = ffs_sbupdate(ump, MNT_WAIT); + if (error) { + UFS_WAPBL_END(mp); + ffs_wapbl_stop(mp, MNT_FORCE); + return error; + } + UFS_WAPBL_END(mp); + error = wapbl_flush(mp->mnt_wapbl, 1); + if (error) { + ffs_wapbl_stop(mp, MNT_FORCE); + return error; + } + } + } else if (fs->fs_flags & FS_DOWAPBL) { + fs->fs_fmod = 1; + fs->fs_flags &= ~FS_DOWAPBL; + } + } + + /* + * It is recommended that you finish replay with logging enabled. + * However, even if logging is not enabled, the remaining log + * replay should be safely recoverable with an fsck, so perform + * it anyway. + */ + if ((fs->fs_ronly == 0) && mp->mnt_wapbl_replay) { + int saveflag = mp->mnt_flag & MNT_RDONLY; + /* + * Make sure MNT_RDONLY is not set so that the inode + * cleanup in ufs_inactive will actually do its work. + */ + mp->mnt_flag &= ~MNT_RDONLY; + ffs_wapbl_replay_finish(mp); + mp->mnt_flag |= saveflag; + KASSERT(fs->fs_ronly == 0); + } + + return 0; +} + +int +ffs_wapbl_stop(struct mount *mp, int force) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + int error; + + if (mp->mnt_wapbl) { + KDASSERT(fs->fs_ronly == 0); + + /* + * Make sure turning off FS_DOWAPBL is only removed + * as the only change in the final flush since otherwise + * a transaction may reorder writes. + */ + error = wapbl_flush(mp->mnt_wapbl, 1); + if (error && !force) + return error; + if (error && force) + goto forceout; + error = UFS_WAPBL_BEGIN(mp); + if (error && !force) + return error; + if (error && force) + goto forceout; + KASSERT(fs->fs_flags & FS_DOWAPBL); + + fs->fs_flags &= ~FS_DOWAPBL; + error = ffs_sbupdate(ump, MNT_WAIT); + KASSERT(error == 0); /* XXX a bit drastic! 
*/ + UFS_WAPBL_END(mp); + forceout: + error = wapbl_stop(mp->mnt_wapbl, force); + if (error) { + KASSERT(!force); + fs->fs_flags |= FS_DOWAPBL; + return error; + } + fs->fs_flags &= ~FS_DOWAPBL; /* Repeat in case of forced error */ + mp->mnt_wapbl = NULL; + +#ifdef WAPBL_DEBUG + printf("%s: disabled logging\n", fs->fs_fsmnt); +#endif + } + + return 0; +} + +int +ffs_wapbl_replay_start(struct mount *mp, struct fs *fs, struct vnode *devvp) +{ + int error; + daddr_t off; + size_t count; + size_t blksize; + uint64_t extradata; + + /* + * WAPBL needs UFS2 format super block, if we got here with a + * UFS1 format super block something is amiss... + */ + if (ffs_superblock_layout(fs) < 2) + return EINVAL; + + error = wapbl_log_position(mp, fs, devvp, &off, &count, &blksize, + &extradata); + + if (error) + return error; + + error = wapbl_replay_start(&mp->mnt_wapbl_replay, devvp, off, + count, blksize); + if (error) + return error; + + mp->mnt_wapbl_op = &wapbl_ops; + + return 0; +} + +/* + * If the superblock doesn't already have a recorded journal location + * then we allocate the journal in one of two positions: + * + * - At the end of the partition after the filesystem if there's + * enough space. "Enough space" is defined as >= 1MB of journal + * per 1GB of filesystem or 64MB, whichever is smaller. + * + * - Inside the filesystem. We try to allocate a contiguous journal + * based on the total filesystem size - the target is 1MB of journal + * per 1GB of filesystem, up to a maximum journal size of 64MB. As + * a worst case allowing for fragmentation, we'll allocate a journal + * 1/4 of the desired size but never smaller than 1MB. + * + * XXX In the future if we allow for non-contiguous journal files we + * can tighten the above restrictions. + * + * XXX + * These seems like a lot of duplication both here and in some of + * the userland tools (fsck_ffs, dumpfs, tunefs) with similar + * "switch (fs_journal_location)" constructs. Can we centralise + * this sort of code somehow/somewhere? 
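 *
 * As a quick sketch of the sizing rule above, expressed with the
 * constants the code below uses (illustrative only; the UFS_WAPBL_*
 * values are assumed to encode the 1 MB-per-1 GB policy with its 1 MB
 * floor and 64 MB ceiling):
 *
 *	desired = lfragtosize(fs, fs->fs_size) / UFS_WAPBL_JOURNAL_SCALE;
 *	desired = max(desired, UFS_WAPBL_MIN_JOURNAL_SIZE);
 *	desired = min(desired, UFS_WAPBL_MAX_JOURNAL_SIZE);
 *
 * so a 16 GB filesystem asks for a 16 MB journal, while anything larger
 * than 64 GB is capped at 64 MB.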
+ */ +static int +wapbl_log_position(struct mount *mp, struct fs *fs, struct vnode *devvp, + daddr_t *startp, size_t *countp, size_t *blksizep, uint64_t *extradatap) +{ + struct ufsmount *ump = VFSTOUFS(mp); + daddr_t logstart, logend, desired_logsize; + uint64_t numsecs; + unsigned secsize; + int error, location; + + if (fs->fs_journal_version == UFS_WAPBL_VERSION) { + switch (fs->fs_journal_location) { + case UFS_WAPBL_JOURNALLOC_END_PARTITION: + DPRINTF("found existing end-of-partition log\n"); + *startp = fs->fs_journallocs[UFS_WAPBL_EPART_ADDR]; + *countp = fs->fs_journallocs[UFS_WAPBL_EPART_COUNT]; + *blksizep = fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]; + DPRINTF(" start = %" PRId64 ", size = %zu, " + "blksize = %zu\n", *startp, *countp, *blksizep); + return 0; + + case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM: + DPRINTF("found existing in-filesystem log\n"); + *startp = fs->fs_journallocs[UFS_WAPBL_INFS_ADDR]; + *countp = fs->fs_journallocs[UFS_WAPBL_INFS_COUNT]; + *blksizep = fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]; + DPRINTF(" start = %" PRId64 ", size = %zu, " + "blksize = %zu\n", *startp, *countp, *blksizep); + return 0; + + default: + printf("ffs_wapbl: unknown journal type %d\n", + fs->fs_journal_location); + return EINVAL; + } + } + + desired_logsize = + lfragtosize(fs, fs->fs_size) / UFS_WAPBL_JOURNAL_SCALE; + DPRINTF("desired log size = %" PRId64 " kB\n", desired_logsize / 1024); + desired_logsize = max(desired_logsize, UFS_WAPBL_MIN_JOURNAL_SIZE); + desired_logsize = min(desired_logsize, UFS_WAPBL_MAX_JOURNAL_SIZE); + DPRINTF("adjusted desired log size = %" PRId64 " kB\n", + desired_logsize / 1024); + + /* Is there space after after filesystem on partition for log? */ + logstart = fsbtodb(fs, fs->fs_size); + error = getdisksize(devvp, &numsecs, &secsize); + if (error) + return error; + KDASSERT(secsize != 0); + logend = btodb(numsecs * secsize); + + if (dbtob(logend - logstart) >= desired_logsize) { + DPRINTF("enough space, use end-of-partition log\n"); + + location = UFS_WAPBL_JOURNALLOC_END_PARTITION; + *blksizep = secsize; + + *startp = logstart; + *countp = (logend - logstart); + *extradatap = 0; + + /* convert to physical block numbers */ + *startp = dbtob(*startp) / secsize; + *countp = dbtob(*countp) / secsize; + + fs->fs_journallocs[UFS_WAPBL_EPART_ADDR] = *startp; + fs->fs_journallocs[UFS_WAPBL_EPART_COUNT] = *countp; + fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ] = *blksizep; + fs->fs_journallocs[UFS_WAPBL_EPART_UNUSED] = *extradatap; + } else { + DPRINTF("end-of-partition has only %" PRId64 " free\n", + logend - logstart); + + location = UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM; + *blksizep = secsize; + + error = wapbl_create_infs_log(mp, fs, devvp, + startp, countp, extradatap); + ffs_sync(mp, MNT_WAIT, FSCRED); + + /* convert to physical block numbers */ + *startp = dbtob(*startp) / secsize; + *countp = dbtob(*countp) / secsize; + + fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] = *startp; + fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] = *countp; + fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ] = *blksizep; + fs->fs_journallocs[UFS_WAPBL_INFS_INO] = *extradatap; + } + + if (error == 0) { + /* update superblock with log location */ + fs->fs_journal_version = UFS_WAPBL_VERSION; + fs->fs_journal_location = location; + fs->fs_journal_flags = 0; + + error = ffs_sbupdate(ump, MNT_WAIT); + } + + return error; +} + +/* + * Try to create a journal log inside the filesystem. 
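 *
 * In outline, the function below allocates a fresh inode, marks it as a
 * regular file with the SF_LOG flag and a link count of one, asks
 * wapbl_allocate_log_file() to reserve a contiguous run of blocks for it
 * (via wapbl_find_log_start() and GOP_ALLOC), and then discards the
 * vnode: only the on-disk placeholder inode matters, and its inode
 * number is what ends up in the superblock's journal locators.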
+ */ +static int +wapbl_create_infs_log(struct mount *mp, struct fs *fs, struct vnode *devvp, + daddr_t *startp, size_t *countp, uint64_t *extradatap) +{ + struct vnode *vp, *rvp; + struct inode *ip; + int error; + + if ((error = VFS_ROOT(mp, &rvp)) != 0) + return error; + + error = UFS_VALLOC(rvp, 0 | S_IFREG, NOCRED, &vp); + if (mp->mnt_flag & MNT_UPDATE) { + vput(rvp); + } else { + VOP_UNLOCK(rvp); + vgone(rvp); + } + if (error != 0) + return error; + + vp->v_type = VREG; + ip = VTOI(vp); + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + ip->i_mode = 0 | IFREG; + DIP_ASSIGN(ip, mode, ip->i_mode); + ip->i_flags = SF_LOG; + DIP_ASSIGN(ip, flags, ip->i_flags); + ip->i_nlink = 1; + DIP_ASSIGN(ip, nlink, 1); + ffs_update(vp, NULL, NULL, UPDATE_WAIT); + + if ((error = wapbl_allocate_log_file(mp, vp, + startp, countp, extradatap)) != 0) { + /* + * If we couldn't allocate the space for the log file, + * remove the inode by setting its link count back to + * zero and bail. + */ + ip->i_nlink = 0; + DIP_ASSIGN(ip, nlink, 0); + VOP_UNLOCK(vp); + vgone(vp); + + return error; + } + + /* + * Now that we have the place-holder inode for the journal, + * we don't need the vnode ever again. + */ + VOP_UNLOCK(vp); + vgone(vp); + + return 0; +} + +int +wapbl_allocate_log_file(struct mount *mp, struct vnode *vp, + daddr_t *startp, size_t *countp, uint64_t *extradatap) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + daddr_t addr, indir_addr; + off_t logsize; + size_t size; + int error; + + logsize = 0; + /* check if there's a suggested log size */ + if (fs->fs_journal_flags & UFS_WAPBL_FLAGS_CREATE_LOG && + fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) + logsize = fs->fs_journallocs[UFS_WAPBL_INFS_COUNT]; + + if (vp->v_size > 0) { + printf("%s: file size (%" PRId64 ") non zero\n", __func__, + vp->v_size); + return EEXIST; + } + wapbl_find_log_start(mp, vp, logsize, &addr, &indir_addr, &size); + if (addr == 0) { + printf("%s: log not allocated, largest extent is " + "%" PRId64 "MB\n", __func__, + lblktosize(fs, size) / (1024 * 1024)); + return ENOSPC; + } + + logsize = lblktosize(fs, size); /* final log size */ + + VTOI(vp)->i_ffs_first_data_blk = addr; + VTOI(vp)->i_ffs_first_indir_blk = indir_addr; + + error = GOP_ALLOC(vp, 0, logsize, B_CONTIG, FSCRED); + if (error) { + printf("%s: GOP_ALLOC error %d\n", __func__, error); + return error; + } + + *startp = fsbtodb(fs, addr); + *countp = btodb(logsize); + *extradatap = VTOI(vp)->i_number; + + return 0; +} + +/* + * Find a suitable location for the journal in the filesystem. + * + * Our strategy here is to look for a contiguous block of free space + * at least "logfile" MB in size (plus room for any indirect blocks). + * We start at the middle of the filesystem and check each cylinder + * group working outwards. If "logfile" MB is not available as a + * single contigous chunk, then return the address and size of the + * largest chunk found. + * + * XXX + * At what stage does the search fail? Is if the largest space we could + * find is less than a quarter the requested space reasonable? If the + * search fails entirely, return a block address if "0" it indicate this. 
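 *
 * The cylinder group loop below visits groups in the order
 *
 *	M, M - 1, M + 1, M - 2, M + 2, ...		(M = fs_ncg / 2)
 *
 * which is what its "s++, n = -n, cg += n * s" update works out to:
 * one step further to each side of the middle on every iteration, until
 * a large enough extent is found or both ends of the filesystem are
 * reached.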
+ */ +static void +wapbl_find_log_start(struct mount *mp, struct vnode *vp, off_t logsize, + daddr_t *addr, daddr_t *indir_addr, size_t *size) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + struct vnode *devvp = ump->um_devvp; + struct cg *cgp; + struct buf *bp; + uint8_t *blksfree; + daddr_t blkno, best_addr, start_addr; + daddr_t desired_blks, min_desired_blks; + daddr_t freeblks, best_blks; + int bpcg, cg, error, fixedsize, indir_blks, n, s; +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + + if (logsize == 0) { + fixedsize = 0; /* We can adjust the size if tight */ + logsize = lfragtosize(fs, fs->fs_dsize) / + UFS_WAPBL_JOURNAL_SCALE; + DPRINTF("suggested log size = %" PRId64 "\n", logsize); + logsize = max(logsize, UFS_WAPBL_MIN_JOURNAL_SIZE); + logsize = min(logsize, UFS_WAPBL_MAX_JOURNAL_SIZE); + DPRINTF("adjusted log size = %" PRId64 "\n", logsize); + } else { + fixedsize = 1; + DPRINTF("fixed log size = %" PRId64 "\n", logsize); + } + + desired_blks = logsize / fs->fs_bsize; + DPRINTF("desired blocks = %" PRId64 "\n", desired_blks); + + /* add in number of indirect blocks needed */ + indir_blks = 0; + if (desired_blks >= NDADDR) { + struct indir indirs[NIADDR + 2]; + int num; + + error = ufs_getlbns(vp, desired_blks, indirs, &num); + if (error) { + printf("%s: ufs_getlbns failed, error %d!\n", + __func__, error); + goto bad; + } + + switch (num) { + case 2: + indir_blks = 1; /* 1st level indirect */ + break; + case 3: + indir_blks = 1 + /* 1st level indirect */ + 1 + /* 2nd level indirect */ + indirs[1].in_off + 1; /* extra 1st level indirect */ + break; + default: + printf("%s: unexpected numlevels %d from ufs_getlbns\n", + __func__, num); + *size = 0; + goto bad; + } + desired_blks += indir_blks; + } + DPRINTF("desired blocks = %" PRId64 " (including indirect)\n", + desired_blks); + + /* + * If a specific size wasn't requested, allow for a smaller log + * if we're really tight for space... + */ + min_desired_blks = desired_blks; + if (!fixedsize) + min_desired_blks = desired_blks / 4; + + /* Look at number of blocks per CG. If it's too small, bail early. */ + bpcg = fragstoblks(fs, fs->fs_fpg); + if (min_desired_blks > bpcg) { + printf("ffs_wapbl: cylinder group size of %" PRId64 " MB " + " is not big enough for journal\n", + lblktosize(fs, bpcg) / (1024 * 1024)); + goto bad; + } + + /* + * Start with the middle cylinder group, and search outwards in + * both directions until we either find the requested log size + * or reach the start/end of the file system. If we reach the + * start/end without finding enough space for the full requested + * log size, use the largest extent found if it is large enough + * to satisfy the our minimum size. + * + * XXX + * Can we just use the cluster contigsum stuff (esp on UFS2) + * here to simplify this search code? + */ + best_addr = 0; + best_blks = 0; + for (cg = fs->fs_ncg / 2, s = 0, n = 1; + best_blks < desired_blks && cg >= 0 && cg < fs->fs_ncg; + s++, n = -n, cg += n * s) { + DPRINTF("check cg %d of %d\n", cg, fs->fs_ncg); + error = bread(devvp, fsbtodb(fs, cgtod(fs, cg)), + fs->fs_cgsize, FSCRED, 0, &bp); + cgp = (struct cg *)bp->b_data; + if (error || !cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) { + brelse(bp, 0); + continue; + } + + blksfree = cg_blksfree(cgp, needswap); + + for (blkno = 0; blkno < bpcg;) { + /* look for next free block */ + /* XXX use scanc() and fragtbl[] here? 
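 *
 * (For reference: the loop below advances blkno to the next free block
 * in this cylinder group, giving up once fewer than min_desired_blks
 * blocks remain; the loop after it counts the run of consecutive free
 * blocks starting there.  The longest run seen so far is kept in
 * best_addr/best_blks, and the scan stops early once a run of at least
 * desired_blks is found.)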
*/ + for (; blkno < bpcg - min_desired_blks; blkno++) + if (ffs_isblock(fs, blksfree, blkno)) + break; + + /* past end of search space in this CG? */ + if (blkno >= bpcg - min_desired_blks) + break; + + /* count how many free blocks in this extent */ + start_addr = blkno; + for (freeblks = 0; blkno < bpcg; blkno++, freeblks++) + if (!ffs_isblock(fs, blksfree, blkno)) + break; + + if (freeblks > best_blks) { + best_blks = freeblks; + best_addr = blkstofrags(fs, start_addr) + + cgbase(fs, cg); + + if (freeblks >= desired_blks) { + DPRINTF("found len %" PRId64 + " at offset %" PRId64 " in gc\n", + freeblks, start_addr); + break; + } + } + } + brelse(bp, 0); + } + DPRINTF("best found len = %" PRId64 ", wanted %" PRId64 + " at addr %" PRId64 "\n", best_blks, desired_blks, best_addr); + + if (best_blks < min_desired_blks) { + *addr = 0; + *indir_addr = 0; + } else { + /* put indirect blocks at start, and data blocks after */ + *addr = best_addr + blkstofrags(fs, indir_blks); + *indir_addr = best_addr; + } + *size = min(desired_blks, best_blks) - indir_blks; + return; + +bad: + *addr = 0; + *indir_addr = 0; + *size = 0; + return; +} diff --git a/include/ufs/ffs/fs.h b/sys/ufs/ffs/fs.h similarity index 100% rename from include/ufs/ffs/fs.h rename to sys/ufs/ffs/fs.h diff --git a/sys/ufs/files.ufs b/sys/ufs/files.ufs new file mode 100644 index 000000000..7bd59a003 --- /dev/null +++ b/sys/ufs/files.ufs @@ -0,0 +1,89 @@ +# $NetBSD: files.ufs,v 1.27 2011/11/24 15:51:31 ahoka Exp $ + +deffs FFS +deffs EXT2FS +deffs MFS +deffs LFS +deffs CHFS + +defflag opt_ffs.h FFS_EI FFS_NO_SNAPSHOT APPLE_UFS + UFS_DIRHASH + UFS_EXTATTR UFS_EXTATTR_AUTOSTART + +defflag opt_lfs.h LFS_KERNEL_RFW + +file ufs/ext2fs/ext2fs_alloc.c ext2fs +file ufs/ext2fs/ext2fs_balloc.c ext2fs +file ufs/ext2fs/ext2fs_bmap.c ext2fs +file ufs/ext2fs/ext2fs_bswap.c ext2fs +file ufs/ext2fs/ext2fs_inode.c ext2fs +file ufs/ext2fs/ext2fs_lookup.c ext2fs +file ufs/ext2fs/ext2fs_readwrite.c ext2fs +file ufs/ext2fs/ext2fs_subr.c ext2fs +file ufs/ext2fs/ext2fs_vfsops.c ext2fs +file ufs/ext2fs/ext2fs_vnops.c ext2fs + +file ufs/chfs/ebh.c chfs +file ufs/chfs/chfs_ihash.c chfs +file ufs/chfs/chfs_scan.c chfs +file ufs/chfs/chfs_write.c chfs +file ufs/chfs/chfs_vnode_cache.c chfs +file ufs/chfs/chfs_erase.c chfs +file ufs/chfs/chfs_build.c chfs +file ufs/chfs/chfs_wbuf.c chfs +file ufs/chfs/chfs_vnops.c chfs +file ufs/chfs/chfs_gc.c chfs +file ufs/chfs/chfs_nodeops.c chfs +file ufs/chfs/chfs_malloc.c chfs +file ufs/chfs/chfs_pool.c chfs +file ufs/chfs/debug.c chfs +file ufs/chfs/chfs_vnode.c chfs +file ufs/chfs/chfs_subr.c chfs +file ufs/chfs/chfs_vfsops.c chfs +file ufs/chfs/chfs_readinode.c chfs + +file ufs/ffs/ffs_alloc.c ffs | lfs | mfs | ext2fs | chfs +file ufs/ffs/ffs_balloc.c ffs | lfs | mfs | ext2fs | chfs +file ufs/ffs/ffs_bswap.c (ffs | mfs) & ffs_ei +file ufs/ffs/ffs_inode.c ffs | lfs | mfs | ext2fs | chfs +file ufs/ffs/ffs_snapshot.c ffs | lfs | mfs | ext2fs | chfs +file ufs/ffs/ffs_subr.c ffs | lfs | mfs | ext2fs | chfs +file ufs/ffs/ffs_tables.c ffs | lfs | mfs | ext2fs | chfs +file ufs/ffs/ffs_vfsops.c ffs | lfs | mfs | ext2fs | chfs +file ufs/ffs/ffs_vnops.c ffs | lfs | mfs | ext2fs | chfs +file ufs/ffs/ffs_wapbl.c ffs & wapbl +file ufs/ffs/ffs_appleufs.c ffs & apple_ufs +file ufs/ffs/ffs_quota2.c quota2 & (ffs | lfs | mfs | ext2fs | chfs) + +file ufs/lfs/lfs_alloc.c lfs +file ufs/lfs/lfs_balloc.c lfs +file ufs/lfs/lfs_bio.c lfs +file ufs/lfs/lfs_cksum.c lfs +file ufs/lfs/lfs_debug.c lfs +file ufs/lfs/lfs_inode.c lfs +file 
ufs/lfs/lfs_itimes.c lfs +file ufs/lfs/lfs_rfw.c lfs & lfs_kernel_rfw +file ufs/lfs/lfs_segment.c lfs +file ufs/lfs/lfs_subr.c lfs +file ufs/lfs/lfs_syscalls.c lfs +file ufs/lfs/lfs_vfsops.c lfs +file ufs/lfs/lfs_vnops.c lfs + +file ufs/mfs/mfs_vfsops.c mfs +file ufs/mfs/mfs_vnops.c mfs +file ufs/mfs/mfs_miniroot.c + +file ufs/ufs/ufs_bmap.c ffs | lfs | mfs | ext2fs | chfs +file ufs/ufs/ufs_dirhash.c (ffs | lfs | mfs | ext2fs | chfs) & ufs_dirhash +file ufs/ufs/ufs_extattr.c (ffs | mfs) & ufs_extattr +file ufs/ufs/ufs_ihash.c ffs | lfs | mfs | ext2fs +file ufs/ufs/ufs_inode.c ffs | lfs | mfs | ext2fs +file ufs/ufs/ufs_lookup.c ffs | lfs | mfs | ext2fs | chfs +file ufs/ufs/ufs_quota.c (quota | quota2) & (ffs | lfs | mfs | ext2fs | chfs) +file ufs/ufs/ufs_quota1.c quota & (ffs | lfs | mfs | ext2fs | chfs) +file ufs/ufs/ufs_quota2.c quota2 & (ffs | lfs | mfs | ext2fs | chfs) +file ufs/ufs/quota1_subr.c +file ufs/ufs/quota2_subr.c quota2 & (ffs | lfs | mfs | ext2fs | chfs) +file ufs/ufs/ufs_vfsops.c ffs | lfs | mfs | ext2fs | chfs +file ufs/ufs/ufs_vnops.c ffs | lfs | mfs | ext2fs | chfs +file ufs/ufs/ufs_wapbl.c ffs & wapbl diff --git a/sys/ufs/lfs/CHANGES b/sys/ufs/lfs/CHANGES new file mode 100644 index 000000000..dfad48551 --- /dev/null +++ b/sys/ufs/lfs/CHANGES @@ -0,0 +1,169 @@ +# $NetBSD: CHANGES,v 1.5 2005/12/11 12:25:26 christos Exp $ + +kernel: + +- Instead of blindly continuing when it encounters an Inode that is + locked by another process, lfs_markv will process the rest of the + inodes passed to it and then return EAGAIN. The cleaner will + recognize this and not mark the segment clean. When the cleaner runs + again, the segment containg the (formerly) locked inode will sort high + for cleaning, since it is now almost entirely empty. + +- A beginning has been made to test keeping atime information in the + Ifile, instead of on the inodes. This should make read-mostly + filesystems significantly faster, since the inodes will then remain + close to the data blocks on disk; but of course the ifile will be + somewhat larger. This code is not enabled, as it makes the format of + IFILEs change. + +- The superblock has been broken into two components: an on-disk + superblock using fixed-size types, exactly 512 bytes regardless of + architecture (or could be enlarged in multiples of the media block + size up to LFS_SBPAD); and an in-memory superblock containing the + information only useful to a running LFS, including segment pointers, + etc. The superblock checksumming code has been modified to make + future changes to the superblock format easier. + +- Because of the way that lfs_writeseg works, buffers are freed before + they are really written to disk: their contents are copied into large + buffers which are written async. Because the buffer cache does not + serve to throttle these writes, and malloced memory is used to hold them, + there is a danger of running out of kmem_map. To avoid this, a new + compile-time parameter, LFS_THROTTLE, is used as an upper bound for the + number of partial-segments allowed to be in progress writing at any + given time. + +- If the system crashes between the point that a checkpoint is scheduled + for writing and the time that the write completes, the filesystem + could be left in an inconsistent state (no valid checkpoints on + disk). To avoid this, we toggle between the first two superblocks + when checkpointing, and (if it is indicated that no roll-forward agent + exists) do not allow one checkpoint to occur before the last one has + completed. 
When the filesystem is mounted, it uses the *older* of the + first two superblocks. + +- DIROPs: + + The design of the LFS includes segregating vnodes used in directory + operations, so that they can be written at the same time during a + checkpoint, avoiding filesystem inconsistency after a crash. Code for + this was partially written for BSD4.4, but was not complete or enabled. + + In particular, vnodes marked VDIROP could be flushed by getnewvnode at + any time, negating the usefulness of marking a vnode VDIROP, since if + the filesystem then crashed it would be inconsistent. Now, when a + vnode is first marked VDIROP it is also referenced. To avoid running + out of vnodes, an attempt to mark more than LFS_MAXDIROP vnodes wth + VDIROP will sleep, and trigger a partial-segment write when no dirops + are active. + +- LFS maintains a linked list of free inode numbers in the Ifile; + accesses to this list are now protected by a simple lock. + +- lfs_vfree is not allowed to run while an inode has blocks scheduled + for writing, since that could trigger a miscounting in lfs_truncate. + +- lfs_balloc now correctly extends fragments, if a block is written + beyond the current end-of-file. + +- Blocks which have already been gathered into a partial-segment are not + allowed to be extended, since if they were, any blocks following them + would either be written in the wrong place, or overwrite other blocks. + +- The LFS buffer-header accounting, which triggers a partial-segment + write if too many buffer-headers are in use by the LFS subystem, has + been expanded to include *bytes* used in LFS buffers as well. + +- Reads of the Ifile, which almost always come from the cleaner, can no + longer trigger a partial-segment write, since this could cause a + deadlock. + +- Support has been added (but not tested, and currently disabled by + default) for true read-only filesystems. Currently, if a filesystem + is mounted read-only the cleaner can still operate on it, but this + obviously would not be true for read-only media. (I think the + original plan was for the roll-forward agent to operate using this + "feature"?) + +- If a fake buffer is created by lfs_markv and another process draws the + same block in and changes it, the fake buffer is now discarded and + replaced by the "real" buffer containing the new data. + +- An inode which has blocks gathered no longer has IN_MODIFIED set, but + still does in fact have dirty blocks attached. lfs_update will now + wait for such an inode's writes to complete before it runs, + suppressing a panic in vinvalbuf. + +- Many filesystem operations now update the Ifile's mtime, allowing the + cleaner to detect when the filesystem is idle, and clean more + vigorously during such times (cf. Blackwell et al., 1995). + +- When writing a partial-segment, make sure that the current segment is + still marked ACTIVE afterward (otherwise the cleaner might try to + clean it, since it might well be mostly empty). + +- Don't trust the cleaner so much. Sort the blocks during gathering, + even if they came from the cleaner; verify the location of on-disk + inodes, even if the cleaner says it knows where they came from. + +- The cleaning code (lfs_markv in particular) has been entirely + rewritten, and the partial-segment writing code changed to match. + Lfs_markv no longer uses its own implementation of lfs_segwrite, but + marks inodes with IN_CLEANING to differentiate them from the + non-cleaning inodes. 
This change fixes numerous problems with the old + cleaner, including a buffer overrun, and lost extensions in active + fragments. lfs_bmapv looks up and returns the addresses of inode + blocks, so the cleaner can do something intelligent with them. + + If IN_CLEANING is set on an inode during partial-segment write, only fake + buffers will be written, and IN_MODIFIED will not be cleared, saving + us from a panic in vinvalbuf. The addition of IN_CLEANING also allows + dirops to be active while cleaning is in progress; since otherwise + buffers engaged in active dirops might be written ahead of schedule, + and cause an inconsistent checkpoint to be written to disk. + + (XXX - even now, DIROP blocks can sometimes be written to disk, if we + are cleaning the same blocks as are active? Grr, I don't see a good + solution for this!) + +- Added sysctl entries for LFS. In particular, `writeindir' controls + whether indirect blocks are written during non-checkpoint writes. + (Since there is no roll-forward agent as yet, there is no penalty in + not writing indirect blocks.) + +- Wake up the cleaner at fs-unmount time, so it can die (if we unmount + and then remount, we could conceivably get more than one cleaner + operating at once). + +newfs_lfs: + +- The ifile inode is now created with the schg flag set, since nothing + ever modifies it. This could be a pain for the roll-forward agent, + but since that should really run *before* the filesystem is mounted, + I don't care. + +- For large disks, it may be necessary to write one or more indirect + blocks when the ifile inode is created. Newlfs has been changed to + write the first indirect block, if necessary. It should instead just + build a set of inodes and blocks, and then use the partial-segment + writing routine mentioned above to write an ifile of whatever size is + desired. + +lfs_cleanerd: + +- Now writes information to the syslog. + +- Can now deal properly with fragments. + +- Sometimes, the cleaner can die. (Why?) If this happens and we don't + notice, we're screwed, since the fs will overfill. So, the invoked + cleaner now spawns itself repeatedly, a la init(8), to ensure that a + cleaner is always present to clean the fs. + +- Added a flag to clean more actively, not on low load average but + filesystem inactivity; a la Blackwell et al., 1995. + +fsck_lfs: + +- Exists, although it currently cannot actually fix anything (it is a + diagnostic tool only at this point). diff --git a/sys/ufs/lfs/Makefile b/sys/ufs/lfs/Makefile new file mode 100644 index 000000000..bb61c7b44 --- /dev/null +++ b/sys/ufs/lfs/Makefile @@ -0,0 +1,7 @@ +# $NetBSD: Makefile,v 1.1 1998/06/12 23:23:12 cgd Exp $ + +INCSDIR= /usr/include/ufs/lfs + +INCS= lfs.h lfs_extern.h + +.include diff --git a/sys/ufs/lfs/README b/sys/ufs/lfs/README new file mode 100644 index 000000000..827edbf92 --- /dev/null +++ b/sys/ufs/lfs/README @@ -0,0 +1,137 @@ +# $NetBSD: README,v 1.3 1999/03/15 00:46:47 perseant Exp $ + +# @(#)README 8.1 (Berkeley) 6/11/93 + +The file system is reasonably stable...I think. + +For details on the implementation, performance and why garbage +collection always wins, see Dr. Margo Seltzer's thesis available for +anonymous ftp from toe.cs.berkeley.edu, in the directory +pub/personal/margo/thesis.ps.Z, or the January 1993 USENIX paper. + +---------- +The disk is laid out in segments. The first segment starts 8K into the +disk (the first 8K is used for boot information). 
Each segment is composed +of the following: + + An optional super block + One or more groups of: + segment summary + 0 or more data blocks + 0 or more inode blocks + +The segment summary and inode/data blocks start after the super block (if +present), and grow toward the end of the segment. + + _______________________________________________ + | | | | | + | summary | data/inode | summary | data/inode | + | block | blocks | block | blocks | ... + |_________|____________|_________|____________| + +The data/inode blocks following a summary block are described by the +summary block. In order to permit the segment to be written in any order +and in a forward direction only, a checksum is calculated across the +blocks described by the summary. Additionally, the summary is checksummed +and timestamped. Both of these are intended for recovery; the former is +to make it easy to determine that it *is* a summary block and the latter +is to make it easy to determine when recovery is finished for partially +written segments. These checksums are also used by the cleaner. + + Summary block (detail) + ________________ + | sum cksum | + | data cksum | + | next segment | + | timestamp | + | FINFO count | + | inode count | + | flags | + |______________| + | FINFO-1 | 0 or more file info structures, identifying the + | . | blocks in the segment. + | . | + | . | + | FINFO-N | + | inode-N | + | . | + | . | + | . | 0 or more inode daddr_t's, identifying the inode + | inode-1 | blocks in the segment. + |______________| + +Inode blocks are blocks of on-disk inodes in the same format as those in +the FFS. However, spare[0] contains the inode number of the inode so we +can find a particular inode on a page. They are packed page_size / +sizeof(inode) to a block. Data blocks are exactly as in the FFS. Both +inodes and data blocks move around the file system at will. + +The file system is described by a super-block which is replicated and +occurs as the first block of the first and other segments. (The maximum +number of super-blocks is MAXNUMSB). Each super-block maintains a list +of the disk addresses of all the super-blocks. The super-block maintains +a small amount of checkpoint information, essentially just enough to find +the inode for the IFILE (fs->lfs_idaddr). + +The IFILE is visible in the file system, as inode number IFILE_INUM. It +contains information shared between the kernel and various user processes. + + Ifile (detail) + ________________ + | cleaner info | Cleaner information per file system. (Page + | | granularity.) + |______________| + | segment | Space available and last modified times per + | usage table | segment. (Page granularity.) + |______________| + | IFILE-1 | Per inode status information: current version #, + | . | if currently allocated, last access time and + | . | current disk address of containing inode block. + | . | If current disk address is LFS_UNUSED_DADDR, the + | IFILE-N | inode is not in use, and it's on the free list. + |______________| + + +First Segment at Creation Time: +_____________________________________________________________ +| | | | | | | | +| 8K pad | Super | summary | inode | ifile | root | l + f | +| | block | | block | | dir | dir | +|________|_______|_________|_______|_______|_______|_______| + ^ + Segment starts here. + +Some differences from the Sprite LFS implementation. + +1. The LFS implementation placed the ifile metadata and the super block + at fixed locations. This implementation replicates the super block + and puts each at a fixed location. 
The checkpoint data is divided into + two parts -- just enough information to find the IFILE is stored in + two of the super blocks, although it is not toggled between them as in + the Sprite implementation. (This was deliberate, to avoid a single + point of failure.) The remaining checkpoint information is treated as + a regular file, which means that the cleaner info, the segment usage + table and the ifile meta-data are stored in normal log segments. + (Tastes great, less filling...) + +2. The segment layout is radically different in Sprite; this implementation + uses something a lot like network framing, where data/inode blocks are + written asynchronously, and a checksum is used to validate any set of + summary and data/inode blocks. Sprite writes summary blocks synchronously + after the data/inode blocks have been written and the existence of the + summary block validates the data/inode blocks. This permits us to write + everything contiguously, even partial segments and their summaries, whereas + Sprite is forced to seek (from the end of the data inode to the summary + which lives at the end of the segment). Additionally, writing the summary + synchronously should cost about 1/2 a rotation per summary. + +3. Sprite LFS distinguishes between different types of blocks in the segment. + Other than inode blocks and data blocks, we don't. + +4. Sprite LFS traverses the IFILE looking for free blocks. We maintain a + free list threaded through the IFILE entries. + +5. The cleaner runs in user space, as opposed to kernel space. It shares + information with the kernel by reading/writing the IFILE and through + cleaner specific system calls. + diff --git a/sys/ufs/lfs/TODO b/sys/ufs/lfs/TODO new file mode 100644 index 000000000..e86ecdb03 --- /dev/null +++ b/sys/ufs/lfs/TODO @@ -0,0 +1,109 @@ +# $NetBSD: TODO,v 1.10 2005/12/11 12:25:26 christos Exp $ + +- Lock audit. Need to check locking for multiprocessor case in particular. + +- Get rid of lfs_segclean(); the kernel should clean a dirty segment IFF it + has passed two checkpoints containing zero live bytes. + +- Now that our cache is basically all of physical memory, we need to make + sure that segwrite is not starving other important things. Need a way + to prioritize which blocks are most important to write, and write only + those, saving the rest for later. Does this change our notion of what + a checkpoint is? + +- Investigate alternate inode locking strategy: Inode locks are useful + for locking against simultaneous changes to inode size (balloc, + truncate, write) but because the assignment of disk blocks is also + covered by the segment lock, we don't really need to pay attention to + the inode lock when writing a segment, right? If this is true, the + locking problem in lfs_{bmapv,markv} goes away and lfs_reserve can go, + too. + +- Get rid of DEV_BSIZE, pay attention to the media block size at mount time. + +- More fs ops need to call lfs_imtime. Which ones? (Blackwell et al., 1995) + +- lfs_vunref_head exists so that vnodes loaded solely for cleaning can + be put back on the *head* of the vnode free list. Make sure we + actually do this, since we now take IN_CLEANING off during segment write. + +- The cleaner could be enhanced to be controlled from other processes, + and possibly perform additional tasks: + + - Backups. At a minimum, turn the cleaner off and on to allow + effective live backups. More aggressively, the cleaner itself could + be the backup agent, and dump_lfs would merely be a controller. 
+ + - Cleaning time policies. Be able to tweak the cleaner's thresholds + to allow more thorough cleaning during policy-determined idle + periods (regardless of actual idleness) or put off until later + during short, intensive write periods. + + - File coalescing and placement. During periods we expect to be idle, + coalesce fragmented files into one place on disk for better read + performance. Ideally, move files that have not been accessed in a + while to the extremes of the disk, thereby shortening seek times for + files that are accessed more frequently (though how the cleaner + should communicate "please put this near the beginning or end of the + disk" to the kernel is a very good question; flags to lfs_markv?). + + - Versioning. When it cleans a segment it could write data for files + that were less than n versions old to tape or elsewhere. Perhaps it + could even write them back onto the disk, although that requires + more thought (and kernel mods). + +- Move lfs_countlocked() into vfs_bio.c, to replace count_locked_queue; + perhaps keep the name, replace the function. Could it count referenced + vnodes as well, if it was in vfs_subr.c instead? + +- Why not delete the lfs_bmapv call, just mark everything dirty that + isn't deleted/truncated? Get some numbers about what percentage of + the stuff that the cleaner thinks might be live is live. If it's + high, get rid of lfs_bmapv. + +- There is a nasty problem in that it may take *more* room to write the + data to clean a segment than is returned by the new segment because of + indirect blocks in segment 2 being dirtied by the data being copied + into the log from segment 1. The suggested solution at this point is + to detect it when we have no space left on the filesystem, write the + extra data into the last segment (leaving no clean ones), make it a + checkpoint and shut down the file system for fixing by a utility + reading the raw partition. Argument is that this should never happen + and is practically impossible to fix since the cleaner would have to + theoretically build a model of the entire filesystem in memory to + detect the condition occurring. A file coalescing cleaner will help + avoid the problem, and one that reads/writes from the raw disk could + fix it. + +- Need to keep vnode v_numoutput up to date for pending writes? + +- If delete a file that's being executed, the version number isn't + updated, and fsck_lfs has to figure this out; case is the same as if + have an inode that no directory references, so the file should be + reattached into lost+found. + +- Currently there's no notion of write error checking. + + Failed data/inode writes should be rescheduled (kernel level bad blocking). + + Failed superblock writes should cause selection of new superblock + for checkpointing. + +- Future fantasies: + - unrm, versioning + - transactions + - extended cleaner policies (hot/cold data, data placement) + +- Problem with the concept of multiple buffer headers referencing the segment: + Positives: + Don't lock down 1 segment per file system of physical memory. + Don't copy from buffers to segment memory. + Don't tie down the bus to transfer 1M. + Works on controllers supporting less than large transfers. + Disk can start writing immediately instead of waiting 1/2 rotation + and the full transfer. + Negatives: + Have to do segment write then segment summary write, since the latter + is what verifies that the segment is okay. (Is there another way + to do this?) 
+ +- The algorithm for selecting the disk addresses of the super-blocks + has to be available to the user program which checks the file system. diff --git a/include/ufs/lfs/lfs.h b/sys/ufs/lfs/lfs.h similarity index 100% rename from include/ufs/lfs/lfs.h rename to sys/ufs/lfs/lfs.h diff --git a/sys/ufs/lfs/lfs_alloc.c b/sys/ufs/lfs/lfs_alloc.c new file mode 100644 index 000000000..8d2baa01d --- /dev/null +++ b/sys/ufs/lfs/lfs_alloc.c @@ -0,0 +1,674 @@ +/* $NetBSD: lfs_alloc.c,v 1.111 2011/06/12 03:36:01 rmind Exp $ */ + +/*- + * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Konrad E. Schroder . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_alloc.c 8.4 (Berkeley) 1/4/94 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: lfs_alloc.c,v 1.111 2011/06/12 03:36:01 rmind Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_quota.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +/* Constants for inode free bitmap */ +#define BMSHIFT 5 /* 2 ** 5 = 32 */ +#define BMMASK ((1 << BMSHIFT) - 1) +#define SET_BITMAP_FREE(F, I) do { \ + DLOG((DLOG_ALLOC, "lfs: ino %d wrd %d bit %d set\n", (int)(I), \ + (int)((I) >> BMSHIFT), (int)((I) & BMMASK))); \ + (F)->lfs_ino_bitmap[(I) >> BMSHIFT] |= (1 << ((I) & BMMASK)); \ +} while (0) +#define CLR_BITMAP_FREE(F, I) do { \ + DLOG((DLOG_ALLOC, "lfs: ino %d wrd %d bit %d clr\n", (int)(I), \ + (int)((I) >> BMSHIFT), (int)((I) & BMMASK))); \ + (F)->lfs_ino_bitmap[(I) >> BMSHIFT] &= ~(1 << ((I) & BMMASK)); \ +} while(0) + +#define ISSET_BITMAP_FREE(F, I) \ + ((F)->lfs_ino_bitmap[(I) >> BMSHIFT] & (1 << ((I) & BMMASK))) + +/* + * Add a new block to the Ifile, to accommodate future file creations. + * Called with the segment lock held. + */ +int +lfs_extend_ifile(struct lfs *fs, kauth_cred_t cred) +{ + struct vnode *vp; + struct inode *ip; + IFILE *ifp; + IFILE_V1 *ifp_v1; + struct buf *bp, *cbp; + int error; + daddr_t i, blkno, xmax; + ino_t oldlast, maxino; + CLEANERINFO *cip; + + ASSERT_SEGLOCK(fs); + + vp = fs->lfs_ivnode; + ip = VTOI(vp); + blkno = lblkno(fs, ip->i_size); + if ((error = lfs_balloc(vp, ip->i_size, fs->lfs_bsize, cred, 0, + &bp)) != 0) { + return (error); + } + ip->i_size += fs->lfs_bsize; + ip->i_ffs1_size = ip->i_size; + uvm_vnp_setsize(vp, ip->i_size); + + maxino = ((ip->i_size >> fs->lfs_bshift) - fs->lfs_cleansz - + fs->lfs_segtabsz) * fs->lfs_ifpb; + fs->lfs_ino_bitmap = (lfs_bm_t *) + realloc(fs->lfs_ino_bitmap, ((maxino + BMMASK) >> BMSHIFT) * + sizeof(lfs_bm_t), M_SEGMENT, M_WAITOK); + KASSERT(fs->lfs_ino_bitmap != NULL); + + i = (blkno - fs->lfs_segtabsz - fs->lfs_cleansz) * + fs->lfs_ifpb; + + /* + * We insert the new inodes at the head of the free list. + * Under normal circumstances, the free list is empty here, + * so we are also incidentally placing them at the end (which + * we must do if we are to keep them in order). 
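 *
 * A small worked example of the chaining done below (illustrative;
 * assume fs->lfs_ifpb, the number of ifile entries per block, is 64):
 * if the new block covers inodes 128..191 and the free list was empty
 * (oldlast == LFS_UNUSED_INUM), the loop links
 *
 *	head -> 128 -> 129 -> ... -> 191 -> LFS_UNUSED_INUM
 *
 * and the tail pointer is set to 191 (xmax - 1).  If the list was not
 * empty, 191 points at the old head instead, which is why order is only
 * preserved when the list starts out empty.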
+ */ + LFS_GET_HEADFREE(fs, cip, cbp, &oldlast); + LFS_PUT_HEADFREE(fs, cip, cbp, i); +#ifdef DIAGNOSTIC + if (fs->lfs_freehd == LFS_UNUSED_INUM) + panic("inode 0 allocated [2]"); +#endif /* DIAGNOSTIC */ + xmax = i + fs->lfs_ifpb; + + if (fs->lfs_version == 1) { + for (ifp_v1 = (IFILE_V1 *)bp->b_data; i < xmax; ++ifp_v1) { + SET_BITMAP_FREE(fs, i); + ifp_v1->if_version = 1; + ifp_v1->if_daddr = LFS_UNUSED_DADDR; + ifp_v1->if_nextfree = ++i; + } + ifp_v1--; + ifp_v1->if_nextfree = oldlast; + } else { + for (ifp = (IFILE *)bp->b_data; i < xmax; ++ifp) { + SET_BITMAP_FREE(fs, i); + ifp->if_version = 1; + ifp->if_daddr = LFS_UNUSED_DADDR; + ifp->if_nextfree = ++i; + } + ifp--; + ifp->if_nextfree = oldlast; + } + LFS_PUT_TAILFREE(fs, cip, cbp, xmax - 1); + + (void) LFS_BWRITE_LOG(bp); /* Ifile */ + + return 0; +} + +/* Allocate a new inode. */ +/* ARGSUSED */ +/* VOP_BWRITE 2i times */ +int +lfs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred, + struct vnode **vpp) +{ + struct lfs *fs; + struct buf *bp, *cbp; + struct ifile *ifp; + ino_t new_ino; + int error; + int new_gen; + CLEANERINFO *cip; + + fs = VTOI(pvp)->i_lfs; + if (fs->lfs_ronly) + return EROFS; + + ASSERT_NO_SEGLOCK(fs); + + lfs_seglock(fs, SEGM_PROT); + vn_lock(fs->lfs_ivnode, LK_EXCLUSIVE); + + /* Get the head of the freelist. */ + LFS_GET_HEADFREE(fs, cip, cbp, &new_ino); + KASSERT(new_ino != LFS_UNUSED_INUM && new_ino != LFS_IFILE_INUM); + + DLOG((DLOG_ALLOC, "lfs_valloc: allocate inode %lld\n", + (long long)new_ino)); + + /* + * Remove the inode from the free list and write the new start + * of the free list into the superblock. + */ + CLR_BITMAP_FREE(fs, new_ino); + LFS_IENTRY(ifp, fs, new_ino, bp); + if (ifp->if_daddr != LFS_UNUSED_DADDR) + panic("lfs_valloc: inuse inode %llu on the free list", + (unsigned long long)new_ino); + LFS_PUT_HEADFREE(fs, cip, cbp, ifp->if_nextfree); + DLOG((DLOG_ALLOC, "lfs_valloc: headfree %lld -> %lld\n", + (long long)new_ino, (long long)ifp->if_nextfree)); + + new_gen = ifp->if_version; /* version was updated by vfree */ + brelse(bp, 0); + + /* Extend IFILE so that the next lfs_valloc will succeed. */ + if (fs->lfs_freehd == LFS_UNUSED_INUM) { + if ((error = lfs_extend_ifile(fs, cred)) != 0) { + LFS_PUT_HEADFREE(fs, cip, cbp, new_ino); + VOP_UNLOCK(fs->lfs_ivnode); + lfs_segunlock(fs); + return error; + } + } +#ifdef DIAGNOSTIC + if (fs->lfs_freehd == LFS_UNUSED_INUM) + panic("inode 0 allocated [3]"); +#endif /* DIAGNOSTIC */ + + /* Set superblock modified bit and increment file count. */ + mutex_enter(&lfs_lock); + fs->lfs_fmod = 1; + mutex_exit(&lfs_lock); + ++fs->lfs_nfiles; + + VOP_UNLOCK(fs->lfs_ivnode); + lfs_segunlock(fs); + + return lfs_ialloc(fs, pvp, new_ino, new_gen, vpp); +} + +/* + * Finish allocating a new inode, given an inode and generation number. + */ +int +lfs_ialloc(struct lfs *fs, struct vnode *pvp, ino_t new_ino, int new_gen, + struct vnode **vpp) +{ + struct inode *ip; + struct vnode *vp; + + ASSERT_NO_SEGLOCK(fs); + + vp = *vpp; + mutex_enter(&ufs_hashlock); + /* Create an inode to associate with the vnode. */ + lfs_vcreate(pvp->v_mount, new_ino, vp); + + ip = VTOI(vp); + mutex_enter(&lfs_lock); + LFS_SET_UINO(ip, IN_CHANGE); + mutex_exit(&lfs_lock); + /* on-disk structure has been zeroed out by lfs_vcreate */ + ip->i_din.ffs1_din->di_inumber = new_ino; + + /* Note no blocks yet */ + ip->i_lfs_hiblk = -1; + + /* Set a new generation number for this inode. 
*/ + if (new_gen) { + ip->i_gen = new_gen; + ip->i_ffs1_gen = new_gen; + } + + /* Insert into the inode hash table. */ + ufs_ihashins(ip); + mutex_exit(&ufs_hashlock); + + ufs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p, vpp); + vp = *vpp; + ip = VTOI(vp); + + memset(ip->i_lfs_fragsize, 0, NDADDR * sizeof(*ip->i_lfs_fragsize)); + + uvm_vnp_setsize(vp, 0); + lfs_mark_vnode(vp); + genfs_node_init(vp, &lfs_genfsops); + vref(ip->i_devvp); + return (0); +} + +/* Create a new vnode/inode pair and initialize what fields we can. */ +void +lfs_vcreate(struct mount *mp, ino_t ino, struct vnode *vp) +{ + struct inode *ip; + struct ufs1_dinode *dp; + struct ufsmount *ump; + + /* Get a pointer to the private mount structure. */ + ump = VFSTOUFS(mp); + + ASSERT_NO_SEGLOCK(ump->um_lfs); + + /* Initialize the inode. */ + ip = pool_get(&lfs_inode_pool, PR_WAITOK); + memset(ip, 0, sizeof(*ip)); + dp = pool_get(&lfs_dinode_pool, PR_WAITOK); + memset(dp, 0, sizeof(*dp)); + ip->inode_ext.lfs = pool_get(&lfs_inoext_pool, PR_WAITOK); + memset(ip->inode_ext.lfs, 0, sizeof(*ip->inode_ext.lfs)); + vp->v_data = ip; + ip->i_din.ffs1_din = dp; + ip->i_ump = ump; + ip->i_vnode = vp; + ip->i_devvp = ump->um_devvp; + ip->i_dev = ump->um_dev; + ip->i_number = dp->di_inumber = ino; + ip->i_lfs = ump->um_lfs; + ip->i_lfs_effnblks = 0; + SPLAY_INIT(&ip->i_lfs_lbtree); + ip->i_lfs_nbtree = 0; + LIST_INIT(&ip->i_lfs_segdhd); +#ifdef QUOTA + ufsquota_init(ip); +#endif +} + +#if 0 +/* + * Find the highest-numbered allocated inode. + * This will be used to shrink the Ifile. + */ +static inline ino_t +lfs_last_alloc_ino(struct lfs *fs) +{ + ino_t ino, maxino; + + maxino = ((fs->lfs_ivnode->v_size >> fs->lfs_bshift) - + fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb; + for (ino = maxino - 1; ino > LFS_UNUSED_INUM; --ino) { + if (ISSET_BITMAP_FREE(fs, ino) == 0) + break; + } + return ino; +} +#endif + +/* + * Find the previous (next lowest numbered) free inode, if any. + * If there is none, return LFS_UNUSED_INUM. + */ +static inline ino_t +lfs_freelist_prev(struct lfs *fs, ino_t ino) +{ + ino_t tino, bound, bb, freehdbb; + + if (fs->lfs_freehd == LFS_UNUSED_INUM) /* No free inodes at all */ + return LFS_UNUSED_INUM; + + /* Search our own word first */ + bound = ino & ~BMMASK; + for (tino = ino - 1; tino >= bound && tino > LFS_UNUSED_INUM; tino--) + if (ISSET_BITMAP_FREE(fs, tino)) + return tino; + /* If there are no lower words to search, just return */ + if (ino >> BMSHIFT == 0) + return LFS_UNUSED_INUM; + + /* + * Find a word with a free inode in it. We have to be a bit + * careful here since ino_t is unsigned. + */ + freehdbb = (fs->lfs_freehd >> BMSHIFT); + for (bb = (ino >> BMSHIFT) - 1; bb >= freehdbb && bb > 0; --bb) + if (fs->lfs_ino_bitmap[bb]) + break; + if (fs->lfs_ino_bitmap[bb] == 0) + return LFS_UNUSED_INUM; + + /* Search the word we found */ + for (tino = (bb << BMSHIFT) | BMMASK; tino >= (bb << BMSHIFT) && + tino > LFS_UNUSED_INUM; tino--) + if (ISSET_BITMAP_FREE(fs, tino)) + break; + + if (tino <= LFS_IFILE_INUM) + tino = LFS_UNUSED_INUM; + + return tino; +} + +/* Free an inode. */ +/* ARGUSED */ +/* VOP_BWRITE 2i times */ +int +lfs_vfree(struct vnode *vp, ino_t ino, int mode) +{ + SEGUSE *sup; + CLEANERINFO *cip; + struct buf *cbp, *bp; + struct ifile *ifp; + struct inode *ip; + struct lfs *fs; + daddr_t old_iaddr; + ino_t otail; + + /* Get the inode number and file system. 
*/ + ip = VTOI(vp); + fs = ip->i_lfs; + ino = ip->i_number; + + ASSERT_NO_SEGLOCK(fs); + DLOG((DLOG_ALLOC, "lfs_vfree: free ino %lld\n", (long long)ino)); + + /* Drain of pending writes */ + mutex_enter(vp->v_interlock); + while (fs->lfs_version > 1 && WRITEINPROG(vp)) { + cv_wait(&vp->v_cv, vp->v_interlock); + } + mutex_exit(vp->v_interlock); + + lfs_seglock(fs, SEGM_PROT); + vn_lock(fs->lfs_ivnode, LK_EXCLUSIVE); + + lfs_unmark_vnode(vp); + mutex_enter(&lfs_lock); + if (vp->v_uflag & VU_DIROP) { + vp->v_uflag &= ~VU_DIROP; + --lfs_dirvcount; + --fs->lfs_dirvcount; + TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain); + wakeup(&fs->lfs_dirvcount); + wakeup(&lfs_dirvcount); + mutex_exit(&lfs_lock); + lfs_vunref(vp); + + /* + * If this inode is not going to be written any more, any + * segment accounting left over from its truncation needs + * to occur at the end of the next dirops flush. Attach + * them to the fs-wide list for that purpose. + */ + if (LIST_FIRST(&ip->i_lfs_segdhd) != NULL) { + struct segdelta *sd; + + while((sd = LIST_FIRST(&ip->i_lfs_segdhd)) != NULL) { + LIST_REMOVE(sd, list); + LIST_INSERT_HEAD(&fs->lfs_segdhd, sd, list); + } + } + } else { + /* + * If it's not a dirop, we can finalize right away. + */ + mutex_exit(&lfs_lock); + lfs_finalize_ino_seguse(fs, ip); + } + + mutex_enter(&lfs_lock); + LFS_CLR_UINO(ip, IN_ACCESSED|IN_CLEANING|IN_MODIFIED); + mutex_exit(&lfs_lock); + ip->i_flag &= ~IN_ALLMOD; + ip->i_lfs_iflags |= LFSI_DELETED; + + /* + * Set the ifile's inode entry to unused, increment its version number + * and link it onto the free chain. + */ + SET_BITMAP_FREE(fs, ino); + LFS_IENTRY(ifp, fs, ino, bp); + old_iaddr = ifp->if_daddr; + ifp->if_daddr = LFS_UNUSED_DADDR; + ++ifp->if_version; + if (fs->lfs_version == 1) { + LFS_GET_HEADFREE(fs, cip, cbp, &(ifp->if_nextfree)); + LFS_PUT_HEADFREE(fs, cip, cbp, ino); + (void) LFS_BWRITE_LOG(bp); /* Ifile */ + } else { + ino_t tino, onf; + + ifp->if_nextfree = LFS_UNUSED_INUM; + (void) LFS_BWRITE_LOG(bp); /* Ifile */ + + tino = lfs_freelist_prev(fs, ino); + if (tino == LFS_UNUSED_INUM) { + /* Nothing free below us, put us on the head */ + LFS_IENTRY(ifp, fs, ino, bp); + LFS_GET_HEADFREE(fs, cip, cbp, &(ifp->if_nextfree)); + LFS_PUT_HEADFREE(fs, cip, cbp, ino); + DLOG((DLOG_ALLOC, "lfs_vfree: headfree %lld -> %lld\n", + (long long)ifp->if_nextfree, (long long)ino)); + LFS_BWRITE_LOG(bp); /* Ifile */ + + /* If the list was empty, set tail too */ + LFS_GET_TAILFREE(fs, cip, cbp, &otail); + if (otail == LFS_UNUSED_INUM) { + LFS_PUT_TAILFREE(fs, cip, cbp, ino); + DLOG((DLOG_ALLOC, "lfs_vfree: tailfree %lld " + "-> %lld\n", (long long)otail, + (long long)ino)); + } + } else { + /* + * Insert this inode into the list after tino. + * We hold the segment lock so we don't have to + * worry about blocks being written out of order. 
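 *
 * Worked example (illustrative): if inode 10 is being freed and the
 * free list currently reads 5 -> 12 -> 20, lfs_freelist_prev() returns
 * 5, so the code links 5 -> 10 -> 12 and the tail stays at 20.  Only
 * when no lower-numbered free inode exists does the branch above put
 * the freed inode at the head of the list.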
+ */ + DLOG((DLOG_ALLOC, "lfs_vfree: insert ino %lld " + " after %lld\n", ino, tino)); + + LFS_IENTRY(ifp, fs, tino, bp); + onf = ifp->if_nextfree; + ifp->if_nextfree = ino; + LFS_BWRITE_LOG(bp); /* Ifile */ + + LFS_IENTRY(ifp, fs, ino, bp); + ifp->if_nextfree = onf; + LFS_BWRITE_LOG(bp); /* Ifile */ + + /* If we're last, put us on the tail */ + if (onf == LFS_UNUSED_INUM) { + LFS_GET_TAILFREE(fs, cip, cbp, &otail); + LFS_PUT_TAILFREE(fs, cip, cbp, ino); + DLOG((DLOG_ALLOC, "lfs_vfree: tailfree %lld " + "-> %lld\n", (long long)otail, + (long long)ino)); + } + } + } +#ifdef DIAGNOSTIC + if (ino == LFS_UNUSED_INUM) { + panic("inode 0 freed"); + } +#endif /* DIAGNOSTIC */ + if (old_iaddr != LFS_UNUSED_DADDR) { + LFS_SEGENTRY(sup, fs, dtosn(fs, old_iaddr), bp); +#ifdef DIAGNOSTIC + if (sup->su_nbytes < sizeof (struct ufs1_dinode)) { + printf("lfs_vfree: negative byte count" + " (segment %" PRIu32 " short by %d)\n", + dtosn(fs, old_iaddr), + (int)sizeof (struct ufs1_dinode) - + sup->su_nbytes); + panic("lfs_vfree: negative byte count"); + sup->su_nbytes = sizeof (struct ufs1_dinode); + } +#endif + sup->su_nbytes -= sizeof (struct ufs1_dinode); + LFS_WRITESEGENTRY(sup, fs, dtosn(fs, old_iaddr), bp); /* Ifile */ + } + + /* Set superblock modified bit and decrement file count. */ + mutex_enter(&lfs_lock); + fs->lfs_fmod = 1; + mutex_exit(&lfs_lock); + --fs->lfs_nfiles; + + VOP_UNLOCK(fs->lfs_ivnode); + lfs_segunlock(fs); + + return (0); +} + +/* + * Sort the freelist and set up the free-inode bitmap. + * To be called by lfs_mountfs(). + */ +void +lfs_order_freelist(struct lfs *fs) +{ + CLEANERINFO *cip; + IFILE *ifp = NULL; + struct buf *bp; + ino_t ino, firstino, lastino, maxino; +#ifdef notyet + struct vnode *vp; +#endif + + ASSERT_NO_SEGLOCK(fs); + lfs_seglock(fs, SEGM_PROT); + + maxino = ((fs->lfs_ivnode->v_size >> fs->lfs_bshift) - + fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb; + fs->lfs_ino_bitmap = (lfs_bm_t *) + malloc(((maxino + BMMASK) >> BMSHIFT) * sizeof(lfs_bm_t), + M_SEGMENT, M_WAITOK | M_ZERO); + KASSERT(fs->lfs_ino_bitmap != NULL); + + firstino = lastino = LFS_UNUSED_INUM; + for (ino = 0; ino < maxino; ino++) { + if (ino % fs->lfs_ifpb == 0) + LFS_IENTRY(ifp, fs, ino, bp); + else + ++ifp; + + /* Don't put zero or ifile on the free list */ + if (ino == LFS_UNUSED_INUM || ino == LFS_IFILE_INUM) + continue; + +#ifdef notyet + /* Address orphaned files */ + if (ifp->if_nextfree == LFS_ORPHAN_NEXTFREE && + VFS_VGET(fs->lfs_ivnode->v_mount, ino, &vp) == 0) { + lfs_truncate(vp, 0, 0, NOCRED); + vput(vp); + LFS_SEGENTRY(sup, fs, dtosn(fs, ifp->if_daddr), bp); + KASSERT(sup->su_nbytes >= DINODE1_SIZE); + sup->su_nbytes -= DINODE1_SIZE; + LFS_WRITESEGENTRY(sup, fs, dtosn(fs, ifp->if_daddr), bp); + + /* Set up to fall through to next section */ + ifp->if_daddr = LFS_UNUSED_DADDR; + LFS_BWRITE_LOG(bp); + LFS_IENTRY(ifp, fs, ino, bp); + } +#endif + + if (ifp->if_daddr == LFS_UNUSED_DADDR) { + if (firstino == LFS_UNUSED_INUM) + firstino = ino; + else { + brelse(bp, 0); + + LFS_IENTRY(ifp, fs, lastino, bp); + ifp->if_nextfree = ino; + LFS_BWRITE_LOG(bp); + + LFS_IENTRY(ifp, fs, ino, bp); + } + lastino = ino; + + SET_BITMAP_FREE(fs, ino); + } + + if ((ino + 1) % fs->lfs_ifpb == 0) + brelse(bp, 0); + } + + LFS_PUT_HEADFREE(fs, cip, bp, firstino); + LFS_PUT_TAILFREE(fs, cip, bp, lastino); + + lfs_segunlock(fs); +} + +void +lfs_orphan(struct lfs *fs, ino_t ino) +{ + IFILE *ifp; + struct buf *bp; + + LFS_IENTRY(ifp, fs, ino, bp); + ifp->if_nextfree = LFS_ORPHAN_NEXTFREE; + LFS_BWRITE_LOG(bp); 
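+	/*
+	 * The ifile entry is left holding LFS_ORPHAN_NEXTFREE rather than
+	 * a valid free-list link.  lfs_order_freelist(), run from
+	 * lfs_mountfs(), is intended to recognize such entries and
+	 * truncate the orphaned inode (that recovery path is currently
+	 * under "#ifdef notyet" above), after which the slot falls onto
+	 * the free list like any other unused inode.
+	 */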
+} diff --git a/sys/ufs/lfs/lfs_balloc.c b/sys/ufs/lfs/lfs_balloc.c new file mode 100644 index 000000000..d46ba0570 --- /dev/null +++ b/sys/ufs/lfs/lfs_balloc.c @@ -0,0 +1,582 @@ +/* $NetBSD: lfs_balloc.c,v 1.70 2011/07/11 08:27:40 hannken Exp $ */ + +/*- + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Konrad E. Schroder . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/* + * Copyright (c) 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)lfs_balloc.c 8.4 (Berkeley) 5/8/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: lfs_balloc.c,v 1.70 2011/07/11 08:27:40 hannken Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_quota.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include +#include + +#include + +int lfs_fragextend(struct vnode *, int, int, daddr_t, struct buf **, kauth_cred_t); + +u_int64_t locked_fakequeue_count; + +/* + * Allocate a block, and to inode and filesystem block accounting for it + * and for any indirect blocks the may need to be created in order for + * this block to be created. + * + * Blocks which have never been accounted for (i.e., which "do not exist") + * have disk address 0, which is translated by ufs_bmap to the special value + * UNASSIGNED == -1, as in the historical UFS. + * + * Blocks which have been accounted for but which have not yet been written + * to disk are given the new special disk address UNWRITTEN == -2, so that + * they can be differentiated from completely new blocks. + */ +/* VOP_BWRITE NIADDR+2 times */ +int +lfs_balloc(struct vnode *vp, off_t startoffset, int iosize, kauth_cred_t cred, + int flags, struct buf **bpp) +{ + int offset; + daddr_t daddr, idaddr; + struct buf *ibp, *bp; + struct inode *ip; + struct lfs *fs; + struct indir indirs[NIADDR+2], *idp; + daddr_t lbn, lastblock; + int bcount; + int error, frags, i, nsize, osize, num; + + ip = VTOI(vp); + fs = ip->i_lfs; + offset = blkoff(fs, startoffset); + KASSERT(iosize <= fs->lfs_bsize); + lbn = lblkno(fs, startoffset); + /* (void)lfs_check(vp, lbn, 0); */ + + ASSERT_MAYBE_SEGLOCK(fs); + + /* + * Three cases: it's a block beyond the end of file, it's a block in + * the file that may or may not have been assigned a disk address or + * we're writing an entire block. + * + * Note, if the daddr is UNWRITTEN, the block already exists in + * the cache (it was read or written earlier). If so, make sure + * we don't count it as a new block or zero out its contents. If + * it did not, make sure we allocate any necessary indirect + * blocks. + * + * If we are writing a block beyond the end of the file, we need to + * check if the old last block was a fragment. If it was, we need + * to rewrite it. + */ + + if (bpp) + *bpp = NULL; + + /* Check for block beyond end of file and fragment extension needed. */ + lastblock = lblkno(fs, ip->i_size); + if (lastblock < NDADDR && lastblock < lbn) { + osize = blksize(fs, ip, lastblock); + if (osize < fs->lfs_bsize && osize > 0) { + if ((error = lfs_fragextend(vp, osize, fs->lfs_bsize, + lastblock, + (bpp ? &bp : NULL), cred))) + return (error); + ip->i_ffs1_size = ip->i_size = + (lastblock + 1) * fs->lfs_bsize; + uvm_vnp_setsize(vp, ip->i_size); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (bpp) + (void) VOP_BWRITE(bp->b_vp, bp); + } + } + + /* + * If the block we are writing is a direct block, it's the last + * block in the file, and offset + iosize is less than a full + * block, we can write one or more fragments. There are two cases: + * the block is brand new and we should allocate it the correct + * size or it already exists and contains some fragments and + * may need to extend it. 
+ */ + if (lbn < NDADDR && lblkno(fs, ip->i_size) <= lbn) { + osize = blksize(fs, ip, lbn); + nsize = fragroundup(fs, offset + iosize); + if (lblktosize(fs, lbn) >= ip->i_size) { + /* Brand new block or fragment */ + frags = numfrags(fs, nsize); + if (!ISSPACE(fs, frags, cred)) + return ENOSPC; + if (bpp) { + *bpp = bp = getblk(vp, lbn, nsize, 0, 0); + bp->b_blkno = UNWRITTEN; + if (flags & B_CLRBUF) + clrbuf(bp); + } + ip->i_lfs_effnblks += frags; + mutex_enter(&lfs_lock); + fs->lfs_bfree -= frags; + mutex_exit(&lfs_lock); + ip->i_ffs1_db[lbn] = UNWRITTEN; + } else { + if (nsize <= osize) { + /* No need to extend */ + if (bpp && (error = bread(vp, lbn, osize, + NOCRED, 0, &bp))) + return error; + } else { + /* Extend existing block */ + if ((error = + lfs_fragextend(vp, osize, nsize, lbn, + (bpp ? &bp : NULL), cred))) + return error; + } + if (bpp) + *bpp = bp; + } + return 0; + } + + error = ufs_bmaparray(vp, lbn, &daddr, &indirs[0], &num, NULL, NULL); + if (error) + return (error); + + daddr = (daddr_t)((int32_t)daddr); /* XXX ondisk32 */ + KASSERT(daddr <= LFS_MAX_DADDR); + + /* + * Do byte accounting all at once, so we can gracefully fail *before* + * we start assigning blocks. + */ + frags = VFSTOUFS(vp->v_mount)->um_seqinc; + bcount = 0; + if (daddr == UNASSIGNED) { + bcount = frags; + } + for (i = 1; i < num; ++i) { + if (!indirs[i].in_exists) { + bcount += frags; + } + } + if (ISSPACE(fs, bcount, cred)) { + mutex_enter(&lfs_lock); + fs->lfs_bfree -= bcount; + mutex_exit(&lfs_lock); + ip->i_lfs_effnblks += bcount; + } else { + return ENOSPC; + } + + if (daddr == UNASSIGNED) { + if (num > 0 && ip->i_ffs1_ib[indirs[0].in_off] == 0) { + ip->i_ffs1_ib[indirs[0].in_off] = UNWRITTEN; + } + + /* + * Create new indirect blocks if necessary + */ + if (num > 1) { + idaddr = ip->i_ffs1_ib[indirs[0].in_off]; + for (i = 1; i < num; ++i) { + ibp = getblk(vp, indirs[i].in_lbn, + fs->lfs_bsize, 0,0); + if (!indirs[i].in_exists) { + clrbuf(ibp); + ibp->b_blkno = UNWRITTEN; + } else if (!(ibp->b_oflags & (BO_DELWRI | BO_DONE))) { + ibp->b_blkno = fsbtodb(fs, idaddr); + ibp->b_flags |= B_READ; + VOP_STRATEGY(vp, ibp); + biowait(ibp); + } + /* + * This block exists, but the next one may not. + * If that is the case mark it UNWRITTEN to keep + * the accounting straight. + */ + /* XXX ondisk32 */ + if (((int32_t *)ibp->b_data)[indirs[i].in_off] == 0) + ((int32_t *)ibp->b_data)[indirs[i].in_off] = + UNWRITTEN; + /* XXX ondisk32 */ + idaddr = ((int32_t *)ibp->b_data)[indirs[i].in_off]; +#ifdef DEBUG + if (vp == fs->lfs_ivnode) { + LFS_ENTER_LOG("balloc", __FILE__, + __LINE__, indirs[i].in_lbn, + ibp->b_flags, curproc->p_pid); + } +#endif + if ((error = VOP_BWRITE(ibp->b_vp, ibp))) + return error; + } + } + } + + + /* + * Get the existing block from the cache, if requested. + */ + if (bpp) + *bpp = bp = getblk(vp, lbn, blksize(fs, ip, lbn), 0, 0); + + /* + * Do accounting on blocks that represent pages. + */ + if (!bpp) + lfs_register_block(vp, lbn); + + /* + * The block we are writing may be a brand new block + * in which case we need to do accounting. + * + * We can tell a truly new block because ufs_bmaparray will say + * it is UNASSIGNED. Once we allocate it we will assign it the + * disk address UNWRITTEN. 
+ */ + if (daddr == UNASSIGNED) { + if (bpp) { + if (flags & B_CLRBUF) + clrbuf(bp); + + /* Note the new address */ + bp->b_blkno = UNWRITTEN; + } + + switch (num) { + case 0: + ip->i_ffs1_db[lbn] = UNWRITTEN; + break; + case 1: + ip->i_ffs1_ib[indirs[0].in_off] = UNWRITTEN; + break; + default: + idp = &indirs[num - 1]; + if (bread(vp, idp->in_lbn, fs->lfs_bsize, NOCRED, + B_MODIFY, &ibp)) + panic("lfs_balloc: bread bno %lld", + (long long)idp->in_lbn); + /* XXX ondisk32 */ + ((int32_t *)ibp->b_data)[idp->in_off] = UNWRITTEN; +#ifdef DEBUG + if (vp == fs->lfs_ivnode) { + LFS_ENTER_LOG("balloc", __FILE__, + __LINE__, idp->in_lbn, + ibp->b_flags, curproc->p_pid); + } +#endif + VOP_BWRITE(ibp->b_vp, ibp); + } + } else if (bpp && !(bp->b_oflags & (BO_DONE|BO_DELWRI))) { + /* + * Not a brand new block, also not in the cache; + * read it in from disk. + */ + if (iosize == fs->lfs_bsize) + /* Optimization: I/O is unnecessary. */ + bp->b_blkno = daddr; + else { + /* + * We need to read the block to preserve the + * existing bytes. + */ + bp->b_blkno = daddr; + bp->b_flags |= B_READ; + VOP_STRATEGY(vp, bp); + return (biowait(bp)); + } + } + + return (0); +} + +/* VOP_BWRITE 1 time */ +int +lfs_fragextend(struct vnode *vp, int osize, int nsize, daddr_t lbn, struct buf **bpp, + kauth_cred_t cred) +{ + struct inode *ip; + struct lfs *fs; + long frags; + int error; + extern long locked_queue_bytes; + size_t obufsize; + + ip = VTOI(vp); + fs = ip->i_lfs; + frags = (long)numfrags(fs, nsize - osize); + error = 0; + + ASSERT_NO_SEGLOCK(fs); + + /* + * Get the seglock so we don't enlarge blocks while a segment + * is being written. If we're called with bpp==NULL, though, + * we are only pretending to change a buffer, so we don't have to + * lock. + */ + top: + if (bpp) { + rw_enter(&fs->lfs_fraglock, RW_READER); + LFS_DEBUG_COUNTLOCKED("frag"); + } + + if (!ISSPACE(fs, frags, cred)) { + error = ENOSPC; + goto out; + } + + /* + * If we are not asked to actually return the block, all we need + * to do is allocate space for it. UBC will handle dirtying the + * appropriate things and making sure it all goes to disk. + * Don't bother to read in that case. + */ + if (bpp && (error = bread(vp, lbn, osize, NOCRED, 0, bpp))) { + brelse(*bpp, 0); + goto out; + } +#ifdef QUOTA + if ((error = chkdq(ip, frags, cred, 0))) { + if (bpp) + brelse(*bpp, 0); + goto out; + } +#endif + /* + * Adjust accounting for lfs_avail. If there's not enough room, + * we will have to wait for the cleaner, which we can't do while + * holding a block busy or while holding the seglock. In that case, + * release both and start over after waiting. 
+ */ + + if (bpp && ((*bpp)->b_oflags & BO_DELWRI)) { + if (!lfs_fits(fs, frags)) { + if (bpp) + brelse(*bpp, 0); +#ifdef QUOTA + chkdq(ip, -frags, cred, 0); +#endif + rw_exit(&fs->lfs_fraglock); + lfs_availwait(fs, frags); + goto top; + } + fs->lfs_avail -= frags; + } + + mutex_enter(&lfs_lock); + fs->lfs_bfree -= frags; + mutex_exit(&lfs_lock); + ip->i_lfs_effnblks += frags; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + + if (bpp) { + obufsize = (*bpp)->b_bufsize; + allocbuf(*bpp, nsize, 1); + + /* Adjust locked-list accounting */ + if (((*bpp)->b_flags & B_LOCKED) != 0 && + (*bpp)->b_iodone == NULL) { + mutex_enter(&lfs_lock); + locked_queue_bytes += (*bpp)->b_bufsize - obufsize; + mutex_exit(&lfs_lock); + } + + memset((char *)((*bpp)->b_data) + osize, 0, (u_int)(nsize - osize)); + } + + out: + if (bpp) { + rw_exit(&fs->lfs_fraglock); + } + return (error); +} + +static inline int +lge(struct lbnentry *a, struct lbnentry *b) +{ + return a->lbn - b->lbn; +} + +SPLAY_PROTOTYPE(lfs_splay, lbnentry, entry, lge); + +SPLAY_GENERATE(lfs_splay, lbnentry, entry, lge); + +/* + * Record this lbn as being "write pending". We used to have this information + * on the buffer headers, but since pages don't have buffer headers we + * record it here instead. + */ +void +lfs_register_block(struct vnode *vp, daddr_t lbn) +{ + struct lfs *fs; + struct inode *ip; + struct lbnentry *lbp; + + ip = VTOI(vp); + + /* Don't count metadata */ + if (lbn < 0 || vp->v_type != VREG || ip->i_number == LFS_IFILE_INUM) + return; + + fs = ip->i_lfs; + + ASSERT_NO_SEGLOCK(fs); + + /* If no space, wait for the cleaner */ + lfs_availwait(fs, btofsb(fs, 1 << fs->lfs_bshift)); + + lbp = (struct lbnentry *)pool_get(&lfs_lbnentry_pool, PR_WAITOK); + lbp->lbn = lbn; + mutex_enter(&lfs_lock); + if (SPLAY_INSERT(lfs_splay, &ip->i_lfs_lbtree, lbp) != NULL) { + mutex_exit(&lfs_lock); + /* Already there */ + pool_put(&lfs_lbnentry_pool, lbp); + return; + } + + ++ip->i_lfs_nbtree; + fs->lfs_favail += btofsb(fs, (1 << fs->lfs_bshift)); + fs->lfs_pages += fs->lfs_bsize >> PAGE_SHIFT; + ++locked_fakequeue_count; + lfs_subsys_pages += fs->lfs_bsize >> PAGE_SHIFT; + mutex_exit(&lfs_lock); +} + +static void +lfs_do_deregister(struct lfs *fs, struct inode *ip, struct lbnentry *lbp) +{ + ASSERT_MAYBE_SEGLOCK(fs); + + mutex_enter(&lfs_lock); + --ip->i_lfs_nbtree; + SPLAY_REMOVE(lfs_splay, &ip->i_lfs_lbtree, lbp); + if (fs->lfs_favail > btofsb(fs, (1 << fs->lfs_bshift))) + fs->lfs_favail -= btofsb(fs, (1 << fs->lfs_bshift)); + fs->lfs_pages -= fs->lfs_bsize >> PAGE_SHIFT; + if (locked_fakequeue_count > 0) + --locked_fakequeue_count; + lfs_subsys_pages -= fs->lfs_bsize >> PAGE_SHIFT; + mutex_exit(&lfs_lock); + + pool_put(&lfs_lbnentry_pool, lbp); +} + +void +lfs_deregister_block(struct vnode *vp, daddr_t lbn) +{ + struct lfs *fs; + struct inode *ip; + struct lbnentry *lbp; + struct lbnentry tmp; + + ip = VTOI(vp); + + /* Don't count metadata */ + if (lbn < 0 || vp->v_type != VREG || ip->i_number == LFS_IFILE_INUM) + return; + + fs = ip->i_lfs; + tmp.lbn = lbn; + lbp = SPLAY_FIND(lfs_splay, &ip->i_lfs_lbtree, &tmp); + if (lbp == NULL) + return; + + lfs_do_deregister(fs, ip, lbp); +} + +void +lfs_deregister_all(struct vnode *vp) +{ + struct lbnentry *lbp, *nlbp; + struct lfs_splay *hd; + struct lfs *fs; + struct inode *ip; + + ip = VTOI(vp); + fs = ip->i_lfs; + hd = &ip->i_lfs_lbtree; + + for (lbp = SPLAY_MIN(lfs_splay, hd); lbp != NULL; lbp = nlbp) { + nlbp = SPLAY_NEXT(lfs_splay, hd, lbp); + lfs_do_deregister(fs, ip, lbp); + } +} diff --git 
a/sys/ufs/lfs/lfs_bio.c b/sys/ufs/lfs/lfs_bio.c new file mode 100644 index 000000000..fe3d4b52e --- /dev/null +++ b/sys/ufs/lfs/lfs_bio.c @@ -0,0 +1,858 @@ +/* $NetBSD: lfs_bio.c,v 1.120 2011/07/11 08:27:40 hannken Exp $ */ + +/*- + * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2008 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Konrad E. Schroder . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)lfs_bio.c 8.10 (Berkeley) 6/10/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: lfs_bio.c,v 1.120 2011/07/11 08:27:40 hannken Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include + +/* + * LFS block write function. + * + * XXX + * No write cost accounting is done. + * This is almost certainly wrong for synchronous operations and NFS. + * + * protected by lfs_lock. + */ +int locked_queue_count = 0; /* Count of locked-down buffers. */ +long locked_queue_bytes = 0L; /* Total size of locked buffers. */ +int lfs_subsys_pages = 0L; /* Total number LFS-written pages */ +int lfs_fs_pagetrip = 0; /* # of pages to trip per-fs write */ +int lfs_writing = 0; /* Set if already kicked off a writer + because of buffer space */ + +/* Lock and condition variables for above. */ +kcondvar_t locked_queue_cv; +kcondvar_t lfs_writing_cv; +kmutex_t lfs_lock; + +extern int lfs_dostats; + +/* + * reserved number/bytes of locked buffers + */ +int locked_queue_rcount = 0; +long locked_queue_rbytes = 0L; + +static int lfs_fits_buf(struct lfs *, int, int); +static int lfs_reservebuf(struct lfs *, struct vnode *vp, struct vnode *vp2, + int, int); +static int lfs_reserveavail(struct lfs *, struct vnode *vp, struct vnode *vp2, + int); + +static int +lfs_fits_buf(struct lfs *fs, int n, int bytes) +{ + int count_fit, bytes_fit; + + ASSERT_NO_SEGLOCK(fs); + KASSERT(mutex_owned(&lfs_lock)); + + count_fit = + (locked_queue_count + locked_queue_rcount + n <= LFS_WAIT_BUFS); + bytes_fit = + (locked_queue_bytes + locked_queue_rbytes + bytes <= LFS_WAIT_BYTES); + +#ifdef DEBUG + if (!count_fit) { + DLOG((DLOG_AVAIL, "lfs_fits_buf: no fit count: %d + %d + %d >= %d\n", + locked_queue_count, locked_queue_rcount, + n, LFS_WAIT_BUFS)); + } + if (!bytes_fit) { + DLOG((DLOG_AVAIL, "lfs_fits_buf: no fit bytes: %ld + %ld + %d >= %ld\n", + locked_queue_bytes, locked_queue_rbytes, + bytes, LFS_WAIT_BYTES)); + } +#endif /* DEBUG */ + + return (count_fit && bytes_fit); +} + +/* ARGSUSED */ +static int +lfs_reservebuf(struct lfs *fs, struct vnode *vp, + struct vnode *vp2, int n, int bytes) +{ + ASSERT_MAYBE_SEGLOCK(fs); + KASSERT(locked_queue_rcount >= 0); + KASSERT(locked_queue_rbytes >= 0); + + mutex_enter(&lfs_lock); + while (n > 0 && !lfs_fits_buf(fs, n, bytes)) { + int error; + + lfs_flush(fs, 0, 0); + + error = cv_timedwait_sig(&locked_queue_cv, &lfs_lock, + hz * LFS_BUFWAIT); + if (error && error != EWOULDBLOCK) { + mutex_exit(&lfs_lock); + return error; + } + } + + locked_queue_rcount += n; + locked_queue_rbytes += bytes; + + if (n < 0) + cv_broadcast(&locked_queue_cv); + + mutex_exit(&lfs_lock); + + KASSERT(locked_queue_rcount >= 0); + KASSERT(locked_queue_rbytes >= 0); + + return 0; +} + +/* + * Try to reserve some blocks, prior to performing a sensitive operation that + * requires the vnode lock to be honored. If there is not enough space, give + * up the vnode lock temporarily and wait for the space to become available. + * + * Called with vp locked. (Note nowever that if fsb < 0, vp is ignored.) + * + * XXX YAMT - it isn't safe to unlock vp here + * because the node might be modified while we sleep. + * (eg. cached states like i_offset might be stale, + * the vnode might be truncated, etc..) + * maybe we should have a way to restart the vnodeop (EVOPRESTART?) 
+ * or rearrange vnodeop interface to leave vnode locking to file system + * specific code so that each file systems can have their own vnode locking and + * vnode re-using strategies. + */ +static int +lfs_reserveavail(struct lfs *fs, struct vnode *vp, + struct vnode *vp2, int fsb) +{ + CLEANERINFO *cip; + struct buf *bp; + int error, slept; + + ASSERT_MAYBE_SEGLOCK(fs); + slept = 0; + mutex_enter(&lfs_lock); + while (fsb > 0 && !lfs_fits(fs, fsb + fs->lfs_ravail + fs->lfs_favail)) { + mutex_exit(&lfs_lock); +#if 0 + /* + * XXX ideally, we should unlock vnodes here + * because we might sleep very long time. + */ + VOP_UNLOCK(vp); + if (vp2 != NULL) { + VOP_UNLOCK(vp2); + } +#else + /* + * XXX since we'll sleep for cleaner with vnode lock holding, + * deadlock will occur if cleaner tries to lock the vnode. + * (eg. lfs_markv -> lfs_fastvget -> getnewvnode -> vclean) + */ +#endif + + if (!slept) { + DLOG((DLOG_AVAIL, "lfs_reserve: waiting for %ld (bfree = %d," + " est_bfree = %d)\n", + fsb + fs->lfs_ravail + fs->lfs_favail, + fs->lfs_bfree, LFS_EST_BFREE(fs))); + } + ++slept; + + /* Wake up the cleaner */ + LFS_CLEANERINFO(cip, fs, bp); + LFS_SYNC_CLEANERINFO(cip, fs, bp, 0); + lfs_wakeup_cleaner(fs); + + mutex_enter(&lfs_lock); + /* Cleaner might have run while we were reading, check again */ + if (lfs_fits(fs, fsb + fs->lfs_ravail + fs->lfs_favail)) + break; + + error = mtsleep(&fs->lfs_avail, PCATCH | PUSER, "lfs_reserve", + 0, &lfs_lock); +#if 0 + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* XXX use lockstatus */ + vn_lock(vp2, LK_EXCLUSIVE | LK_RETRY); /* XXX use lockstatus */ +#endif + if (error) { + mutex_exit(&lfs_lock); + return error; + } + } +#ifdef DEBUG + if (slept) { + DLOG((DLOG_AVAIL, "lfs_reserve: woke up\n")); + } +#endif + fs->lfs_ravail += fsb; + mutex_exit(&lfs_lock); + + return 0; +} + +#ifdef DIAGNOSTIC +int lfs_rescount; +int lfs_rescountdirop; +#endif + +int +lfs_reserve(struct lfs *fs, struct vnode *vp, struct vnode *vp2, int fsb) +{ + int error; + int cantwait; + + ASSERT_MAYBE_SEGLOCK(fs); + if (vp2) { + /* Make sure we're not in the process of reclaiming vp2 */ + mutex_enter(&lfs_lock); + while(fs->lfs_flags & LFS_UNDIROP) { + mtsleep(&fs->lfs_flags, PRIBIO + 1, "lfsrundirop", 0, + &lfs_lock); + } + mutex_exit(&lfs_lock); + } + + KASSERT(fsb < 0 || VOP_ISLOCKED(vp)); + KASSERT(vp2 == NULL || fsb < 0 || VOP_ISLOCKED(vp2)); + KASSERT(vp2 == NULL || !(VTOI(vp2)->i_flag & IN_ADIROP)); + KASSERT(vp2 == NULL || vp2 != fs->lfs_unlockvp); + + cantwait = (VTOI(vp)->i_flag & IN_ADIROP) || fs->lfs_unlockvp == vp; +#ifdef DIAGNOSTIC + if (cantwait) { + if (fsb > 0) + lfs_rescountdirop++; + else if (fsb < 0) + lfs_rescountdirop--; + if (lfs_rescountdirop < 0) + panic("lfs_rescountdirop"); + } + else { + if (fsb > 0) + lfs_rescount++; + else if (fsb < 0) + lfs_rescount--; + if (lfs_rescount < 0) + panic("lfs_rescount"); + } +#endif + if (cantwait) + return 0; + + /* + * XXX + * vref vnodes here so that cleaner doesn't try to reuse them. + * (see XXX comment in lfs_reserveavail) + */ + vhold(vp); + if (vp2 != NULL) { + vhold(vp2); + } + + error = lfs_reserveavail(fs, vp, vp2, fsb); + if (error) + goto done; + + /* + * XXX just a guess. should be more precise. 
+ */ + error = lfs_reservebuf(fs, vp, vp2, fsb, fsbtob(fs, fsb)); + if (error) + lfs_reserveavail(fs, vp, vp2, -fsb); + +done: + holdrele(vp); + if (vp2 != NULL) { + holdrele(vp2); + } + + return error; +} + +int +lfs_bwrite(void *v) +{ + struct vop_bwrite_args /* { + struct vnode *a_vp; + struct buf *a_bp; + } */ *ap = v; + struct buf *bp = ap->a_bp; + +#ifdef DIAGNOSTIC + if (VTOI(bp->b_vp)->i_lfs->lfs_ronly == 0 && (bp->b_flags & B_ASYNC)) { + panic("bawrite LFS buffer"); + } +#endif /* DIAGNOSTIC */ + return lfs_bwrite_ext(bp, 0); +} + +/* + * Determine if there is enough room currently available to write fsb + * blocks. We need enough blocks for the new blocks, the current + * inode blocks (including potentially the ifile inode), a summary block, + * and the segment usage table, plus an ifile block. + */ +int +lfs_fits(struct lfs *fs, int fsb) +{ + int needed; + + ASSERT_NO_SEGLOCK(fs); + needed = fsb + btofsb(fs, fs->lfs_sumsize) + + ((howmany(fs->lfs_uinodes + 1, INOPB(fs)) + fs->lfs_segtabsz + + 1) << (fs->lfs_bshift - fs->lfs_ffshift)); + + if (needed >= fs->lfs_avail) { +#ifdef DEBUG + DLOG((DLOG_AVAIL, "lfs_fits: no fit: fsb = %ld, uinodes = %ld, " + "needed = %ld, avail = %ld\n", + (long)fsb, (long)fs->lfs_uinodes, (long)needed, + (long)fs->lfs_avail)); +#endif + return 0; + } + return 1; +} + +int +lfs_availwait(struct lfs *fs, int fsb) +{ + int error; + CLEANERINFO *cip; + struct buf *cbp; + + ASSERT_NO_SEGLOCK(fs); + /* Push cleaner blocks through regardless */ + mutex_enter(&lfs_lock); + if (LFS_SEGLOCK_HELD(fs) && + fs->lfs_sp->seg_flags & (SEGM_CLEAN | SEGM_FORCE_CKP)) { + mutex_exit(&lfs_lock); + return 0; + } + mutex_exit(&lfs_lock); + + while (!lfs_fits(fs, fsb)) { + /* + * Out of space, need cleaner to run. + * Update the cleaner info, then wake it up. + * Note the cleanerinfo block is on the ifile + * so it CANT_WAIT. + */ + LFS_CLEANERINFO(cip, fs, cbp); + LFS_SYNC_CLEANERINFO(cip, fs, cbp, 0); + +#ifdef DEBUG + DLOG((DLOG_AVAIL, "lfs_availwait: out of available space, " + "waiting on cleaner\n")); +#endif + + lfs_wakeup_cleaner(fs); +#ifdef DIAGNOSTIC + if (LFS_SEGLOCK_HELD(fs)) + panic("lfs_availwait: deadlock"); +#endif + error = tsleep(&fs->lfs_avail, PCATCH | PUSER, "cleaner", 0); + if (error) + return (error); + } + return 0; +} + +int +lfs_bwrite_ext(struct buf *bp, int flags) +{ + struct lfs *fs; + struct inode *ip; + struct vnode *vp; + int fsb; + + vp = bp->b_vp; + fs = VFSTOUFS(vp->v_mount)->um_lfs; + + ASSERT_MAYBE_SEGLOCK(fs); + KASSERT(bp->b_cflags & BC_BUSY); + KASSERT(flags & BW_CLEAN || !LFS_IS_MALLOC_BUF(bp)); + KASSERT(((bp->b_oflags | bp->b_flags) & (BO_DELWRI|B_LOCKED)) + != BO_DELWRI); + + /* + * Don't write *any* blocks if we're mounted read-only, or + * if we are "already unmounted". + * + * In particular the cleaner can't write blocks either. + */ + if (fs->lfs_ronly || (fs->lfs_pflags & LFS_PF_CLEAN)) { + bp->b_oflags &= ~BO_DELWRI; + bp->b_flags |= B_READ; + bp->b_error = 0; + mutex_enter(&bufcache_lock); + LFS_UNLOCK_BUF(bp); + if (LFS_IS_MALLOC_BUF(bp)) + bp->b_cflags &= ~BC_BUSY; + else + brelsel(bp, 0); + mutex_exit(&bufcache_lock); + return (fs->lfs_ronly ? EROFS : 0); + } + + /* + * Set the delayed write flag and use reassignbuf to move the buffer + * from the clean list to the dirty one. + * + * Set the B_LOCKED flag and unlock the buffer, causing brelse to move + * the buffer onto the LOCKED free list. This is necessary, otherwise + * getnewbuf() would try to reclaim the buffers using bawrite, which + * isn't going to work. 
+ * + * XXX we don't let meta-data writes run out of space because they can + * come from the segment writer. We need to make sure that there is + * enough space reserved so that there's room to write meta-data + * blocks. + */ + if ((bp->b_flags & B_LOCKED) == 0) { + fsb = numfrags(fs, bp->b_bcount); + + ip = VTOI(vp); + mutex_enter(&lfs_lock); + if (flags & BW_CLEAN) { + LFS_SET_UINO(ip, IN_CLEANING); + } else { + LFS_SET_UINO(ip, IN_MODIFIED); + } + mutex_exit(&lfs_lock); + fs->lfs_avail -= fsb; + + mutex_enter(&bufcache_lock); + mutex_enter(vp->v_interlock); + bp->b_oflags = (bp->b_oflags | BO_DELWRI) & ~BO_DONE; + LFS_LOCK_BUF(bp); + bp->b_flags &= ~B_READ; + bp->b_error = 0; + reassignbuf(bp, bp->b_vp); + mutex_exit(vp->v_interlock); + } else { + mutex_enter(&bufcache_lock); + } + + if (bp->b_iodone != NULL) + bp->b_cflags &= ~BC_BUSY; + else + brelsel(bp, 0); + mutex_exit(&bufcache_lock); + + return (0); +} + +/* + * Called and return with the lfs_lock held. + */ +void +lfs_flush_fs(struct lfs *fs, int flags) +{ + ASSERT_NO_SEGLOCK(fs); + KASSERT(mutex_owned(&lfs_lock)); + if (fs->lfs_ronly) + return; + + if (lfs_dostats) + ++lfs_stats.flush_invoked; + + mutex_exit(&lfs_lock); + lfs_writer_enter(fs, "fldirop"); + lfs_segwrite(fs->lfs_ivnode->v_mount, flags); + lfs_writer_leave(fs); + mutex_enter(&lfs_lock); + fs->lfs_favail = 0; /* XXX */ +} + +/* + * This routine initiates segment writes when LFS is consuming too many + * resources. Ideally the pageout daemon would be able to direct LFS + * more subtly. + * XXX We have one static count of locked buffers; + * XXX need to think more about the multiple filesystem case. + * + * Called and return with lfs_lock held. + * If fs != NULL, we hold the segment lock for fs. + */ +void +lfs_flush(struct lfs *fs, int flags, int only_onefs) +{ + extern u_int64_t locked_fakequeue_count; + struct mount *mp, *nmp; + struct lfs *tfs; + + KASSERT(mutex_owned(&lfs_lock)); + KDASSERT(fs == NULL || !LFS_SEGLOCK_HELD(fs)); + + if (lfs_dostats) + ++lfs_stats.write_exceeded; + /* XXX should we include SEGM_CKP here? */ + if (lfs_writing && !(flags & SEGM_SYNC)) { + DLOG((DLOG_FLUSH, "lfs_flush: not flushing because another flush is active\n")); + return; + } + while (lfs_writing) + cv_wait(&lfs_writing_cv, &lfs_lock); + lfs_writing = 1; + + mutex_exit(&lfs_lock); + + if (only_onefs) { + KASSERT(fs != NULL); + if (vfs_busy(fs->lfs_ivnode->v_mount, NULL)) + goto errout; + mutex_enter(&lfs_lock); + lfs_flush_fs(fs, flags); + mutex_exit(&lfs_lock); + vfs_unbusy(fs->lfs_ivnode->v_mount, false, NULL); + } else { + locked_fakequeue_count = 0; + mutex_enter(&mountlist_lock); + for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist; + mp = nmp) { + if (vfs_busy(mp, &nmp)) { + DLOG((DLOG_FLUSH, "lfs_flush: fs vfs_busy\n")); + continue; + } + if (strncmp(&mp->mnt_stat.f_fstypename[0], MOUNT_LFS, + sizeof(mp->mnt_stat.f_fstypename)) == 0) { + tfs = VFSTOUFS(mp)->um_lfs; + mutex_enter(&lfs_lock); + lfs_flush_fs(tfs, flags); + mutex_exit(&lfs_lock); + } + vfs_unbusy(mp, false, &nmp); + } + mutex_exit(&mountlist_lock); + } + LFS_DEBUG_COUNTLOCKED("flush"); + wakeup(&lfs_subsys_pages); + + errout: + mutex_enter(&lfs_lock); + KASSERT(lfs_writing); + lfs_writing = 0; + wakeup(&lfs_writing); +} + +#define INOCOUNT(fs) howmany((fs)->lfs_uinodes, INOPB(fs)) +#define INOBYTES(fs) ((fs)->lfs_uinodes * sizeof (struct ufs1_dinode)) + +/* + * make sure that we don't have too many locked buffers. + * flush buffers if needed. 
+ */ +int +lfs_check(struct vnode *vp, daddr_t blkno, int flags) +{ + int error; + struct lfs *fs; + struct inode *ip; + extern pid_t lfs_writer_daemon; + + error = 0; + ip = VTOI(vp); + + /* If out of buffers, wait on writer */ + /* XXX KS - if it's the Ifile, we're probably the cleaner! */ + if (ip->i_number == LFS_IFILE_INUM) + return 0; + /* If we're being called from inside a dirop, don't sleep */ + if (ip->i_flag & IN_ADIROP) + return 0; + + fs = ip->i_lfs; + + ASSERT_NO_SEGLOCK(fs); + + /* + * If we would flush below, but dirops are active, sleep. + * Note that a dirop cannot ever reach this code! + */ + mutex_enter(&lfs_lock); + while (fs->lfs_dirops > 0 && + (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS || + locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES || + lfs_subsys_pages > LFS_MAX_PAGES || + fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) || + lfs_dirvcount > LFS_MAX_DIROP || fs->lfs_diropwait > 0)) + { + ++fs->lfs_diropwait; + mtsleep(&fs->lfs_writer, PRIBIO+1, "bufdirop", 0, + &lfs_lock); + --fs->lfs_diropwait; + } + +#ifdef DEBUG + if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS) + DLOG((DLOG_FLUSH, "lfs_check: lqc = %d, max %d\n", + locked_queue_count + INOCOUNT(fs), LFS_MAX_BUFS)); + if (locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES) + DLOG((DLOG_FLUSH, "lfs_check: lqb = %ld, max %ld\n", + locked_queue_bytes + INOBYTES(fs), LFS_MAX_BYTES)); + if (lfs_subsys_pages > LFS_MAX_PAGES) + DLOG((DLOG_FLUSH, "lfs_check: lssp = %d, max %d\n", + lfs_subsys_pages, LFS_MAX_PAGES)); + if (lfs_fs_pagetrip && fs->lfs_pages > lfs_fs_pagetrip) + DLOG((DLOG_FLUSH, "lfs_check: fssp = %d, trip at %d\n", + fs->lfs_pages, lfs_fs_pagetrip)); + if (lfs_dirvcount > LFS_MAX_DIROP) + DLOG((DLOG_FLUSH, "lfs_check: ldvc = %d, max %d\n", + lfs_dirvcount, LFS_MAX_DIROP)); + if (fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs)) + DLOG((DLOG_FLUSH, "lfs_check: lfdvc = %d, max %d\n", + fs->lfs_dirvcount, LFS_MAX_FSDIROP(fs))); + if (fs->lfs_diropwait > 0) + DLOG((DLOG_FLUSH, "lfs_check: ldvw = %d\n", + fs->lfs_diropwait)); +#endif + + /* If there are too many pending dirops, we have to flush them. */ + if (fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) || + lfs_dirvcount > LFS_MAX_DIROP || fs->lfs_diropwait > 0) { + flags |= SEGM_CKP; + } + + if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS || + locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES || + lfs_subsys_pages > LFS_MAX_PAGES || + fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) || + lfs_dirvcount > LFS_MAX_DIROP || fs->lfs_diropwait > 0) { + lfs_flush(fs, flags, 0); + } else if (lfs_fs_pagetrip && fs->lfs_pages > lfs_fs_pagetrip) { + /* + * If we didn't flush the whole thing, some filesystems + * still might want to be flushed. + */ + ++fs->lfs_pdflush; + wakeup(&lfs_writer_daemon); + } + + while (locked_queue_count + INOCOUNT(fs) >= LFS_WAIT_BUFS || + locked_queue_bytes + INOBYTES(fs) >= LFS_WAIT_BYTES || + lfs_subsys_pages > LFS_WAIT_PAGES || + fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) || + lfs_dirvcount > LFS_MAX_DIROP) { + + if (lfs_dostats) + ++lfs_stats.wait_exceeded; + DLOG((DLOG_AVAIL, "lfs_check: waiting: count=%d, bytes=%ld\n", + locked_queue_count, locked_queue_bytes)); + error = cv_timedwait_sig(&locked_queue_cv, &lfs_lock, + hz * LFS_BUFWAIT); + if (error != EWOULDBLOCK) + break; + + /* + * lfs_flush might not flush all the buffers, if some of the + * inodes were locked or if most of them were Ifile blocks + * and we weren't asked to checkpoint. Try flushing again + * to keep us from blocking indefinitely. 
+ */ + if (locked_queue_count + INOCOUNT(fs) >= LFS_MAX_BUFS || + locked_queue_bytes + INOBYTES(fs) >= LFS_MAX_BYTES) { + lfs_flush(fs, flags | SEGM_CKP, 0); + } + } + mutex_exit(&lfs_lock); + return (error); +} + +/* + * Allocate a new buffer header. + */ +struct buf * +lfs_newbuf(struct lfs *fs, struct vnode *vp, daddr_t daddr, size_t size, int type) +{ + struct buf *bp; + size_t nbytes; + + ASSERT_MAYBE_SEGLOCK(fs); + nbytes = roundup(size, fsbtob(fs, 1)); + + bp = getiobuf(NULL, true); + if (nbytes) { + bp->b_data = lfs_malloc(fs, nbytes, type); + /* memset(bp->b_data, 0, nbytes); */ + } +#ifdef DIAGNOSTIC + if (vp == NULL) + panic("vp is NULL in lfs_newbuf"); + if (bp == NULL) + panic("bp is NULL after malloc in lfs_newbuf"); +#endif + + bp->b_bufsize = size; + bp->b_bcount = size; + bp->b_lblkno = daddr; + bp->b_blkno = daddr; + bp->b_error = 0; + bp->b_resid = 0; + bp->b_iodone = lfs_callback; + bp->b_cflags = BC_BUSY | BC_NOCACHE; + bp->b_private = fs; + + mutex_enter(&bufcache_lock); + mutex_enter(vp->v_interlock); + bgetvp(vp, bp); + mutex_exit(vp->v_interlock); + mutex_exit(&bufcache_lock); + + return (bp); +} + +void +lfs_freebuf(struct lfs *fs, struct buf *bp) +{ + struct vnode *vp; + + if ((vp = bp->b_vp) != NULL) { + mutex_enter(&bufcache_lock); + mutex_enter(vp->v_interlock); + brelvp(bp); + mutex_exit(vp->v_interlock); + mutex_exit(&bufcache_lock); + } + if (!(bp->b_cflags & BC_INVAL)) { /* BC_INVAL indicates a "fake" buffer */ + lfs_free(fs, bp->b_data, LFS_NB_UNKNOWN); + bp->b_data = NULL; + } + putiobuf(bp); +} + +/* + * Count buffers on the "locked" queue, and compare it to a pro-forma count. + * Don't count malloced buffers, since they don't detract from the total. + */ +void +lfs_countlocked(int *count, long *bytes, const char *msg) +{ + struct buf *bp; + int n = 0; + long int size = 0L; + + mutex_enter(&bufcache_lock); + TAILQ_FOREACH(bp, &bufqueues[BQ_LOCKED].bq_queue, b_freelist) { + KASSERT(bp->b_iodone == NULL); + n++; + size += bp->b_bufsize; +#ifdef DIAGNOSTIC + if (n > nbuf) + panic("lfs_countlocked: this can't happen: more" + " buffers locked than exist"); +#endif + } + /* + * Theoretically this function never really does anything. + * Give a warning if we have to fix the accounting. + */ + if (n != *count) { + DLOG((DLOG_LLIST, "lfs_countlocked: %s: adjusted buf count" + " from %d to %d\n", msg, *count, n)); + } + if (size != *bytes) { + DLOG((DLOG_LLIST, "lfs_countlocked: %s: adjusted byte count" + " from %ld to %ld\n", msg, *bytes, size)); + } + *count = n; + *bytes = size; + mutex_exit(&bufcache_lock); + return; +} + +int +lfs_wait_pages(void) +{ + int active, inactive; + + uvm_estimatepageable(&active, &inactive); + return LFS_WAIT_RESOURCE(active + inactive + uvmexp.free, 1); +} + +int +lfs_max_pages(void) +{ + int active, inactive; + + uvm_estimatepageable(&active, &inactive); + return LFS_MAX_RESOURCE(active + inactive + uvmexp.free, 1); +} diff --git a/sys/ufs/lfs/lfs_cksum.c b/sys/ufs/lfs/lfs_cksum.c new file mode 100644 index 000000000..a3f0fb93d --- /dev/null +++ b/sys/ufs/lfs/lfs_cksum.c @@ -0,0 +1,110 @@ +/* $NetBSD: lfs_cksum.c,v 1.27 2008/04/28 20:24:11 martin Exp $ */ + +/*- + * Copyright (c) 1999, 2000, 2001, 2002 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Konrad E. Schroder . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_cksum.c 8.2 (Berkeley) 10/9/94 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: lfs_cksum.c,v 1.27 2008/04/28 20:24:11 martin Exp $"); + +#include +#ifdef _KERNEL +# include +# include +#else +# include +#endif +#include +#include +#include +#include + +/* + * Simple, general purpose, fast checksum. Data must be short-aligned. + * Returns a u_long in case we ever want to do something more rigorous. + * + * XXX + * Use the TCP/IP checksum instead. 
+ */ +u_int32_t +lfs_cksum_part(void *str, size_t len, u_int32_t sum) +{ + + len &= ~(sizeof(u_int16_t) - 1); + for (; len; len -= sizeof(u_int16_t)) { + sum ^= *(u_int16_t *)str; + str = (void *)((u_int16_t *)str + 1); + } + return (sum); +} + +u_int32_t +cksum(void *str, size_t len) +{ + + return lfs_cksum_fold(lfs_cksum_part(str, len, 0)); +} + +u_int32_t +lfs_sb_cksum(struct dlfs *fs) +{ + size_t size; + + size = (size_t)offsetof(struct dlfs, dlfs_cksum); + return cksum(fs, size); +} diff --git a/sys/ufs/lfs/lfs_debug.c b/sys/ufs/lfs/lfs_debug.c new file mode 100644 index 000000000..ecad77204 --- /dev/null +++ b/sys/ufs/lfs/lfs_debug.c @@ -0,0 +1,325 @@ +/* $NetBSD: lfs_debug.c,v 1.39 2011/07/17 20:54:54 joerg Exp $ */ + +/*- + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Konrad E. Schroder . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_debug.c 8.1 (Berkeley) 6/11/93 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: lfs_debug.c,v 1.39 2011/07/17 20:54:54 joerg Exp $"); + +#ifdef DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +int lfs_lognum; +struct lfs_log_entry lfs_log[LFS_LOGLENGTH]; + +int +lfs_bwrite_log(struct buf *bp, const char *file, int line) +{ + struct vop_bwrite_args a; + + a.a_desc = VDESC(vop_bwrite); + a.a_bp = bp; + + if (!(bp->b_flags & B_GATHERED) && !(bp->b_oflags & BO_DELWRI)) { + LFS_ENTER_LOG("write", file, line, bp->b_lblkno, bp->b_flags, + curproc->p_pid); + } + return (VCALL(bp->b_vp, VOFFSET(vop_bwrite), &a)); +} + +void +lfs_dumplog(void) +{ + int i; + const char *cp; + + for (i = lfs_lognum; i != (lfs_lognum - 1) % LFS_LOGLENGTH; + i = (i + 1) % LFS_LOGLENGTH) + if (lfs_log[i].file) { + /* Only print out basename, for readability */ + cp = lfs_log[i].file; + while(*cp) + ++cp; + while(*cp != '/' && cp > lfs_log[i].file) + --cp; + + printf("lbn %" PRId64 " %s %lx %d, %d %s\n", + lfs_log[i].block, + lfs_log[i].op, + lfs_log[i].flags, + lfs_log[i].pid, + lfs_log[i].line, + cp); + } +} + +void +lfs_dump_super(struct lfs *lfsp) +{ + int i; + + printf("%s%x\t%s%x\t%s%d\t%s%d\n", + "magic ", lfsp->lfs_magic, + "version ", lfsp->lfs_version, + "size ", lfsp->lfs_size, + "ssize ", lfsp->lfs_ssize); + printf("%s%d\t%s%d\t%s%d\t%s%d\n", + "dsize ", lfsp->lfs_dsize, + "bsize ", lfsp->lfs_bsize, + "fsize ", lfsp->lfs_fsize, + "frag ", lfsp->lfs_frag); + + printf("%s%d\t%s%d\t%s%d\t%s%d\n", + "minfree ", lfsp->lfs_minfree, + "inopb ", lfsp->lfs_inopb, + "ifpb ", lfsp->lfs_ifpb, + "nindir ", lfsp->lfs_nindir); + + printf("%s%d\t%s%d\t%s%d\t%s%d\n", + "nseg ", lfsp->lfs_nseg, + "nspf ", lfsp->lfs_nspf, + "cleansz ", lfsp->lfs_cleansz, + "segtabsz ", lfsp->lfs_segtabsz); + + printf("%s%x\t%s%d\t%s%lx\t%s%d\n", + "segmask ", lfsp->lfs_segmask, + "segshift ", lfsp->lfs_segshift, + "bmask ", (unsigned long)lfsp->lfs_bmask, + "bshift ", lfsp->lfs_bshift); + + printf("%s%lu\t%s%d\t%s%lx\t%s%u\n", + "ffmask ", (unsigned long)lfsp->lfs_ffmask, + "ffshift ", lfsp->lfs_ffshift, + "fbmask ", (unsigned long)lfsp->lfs_fbmask, + "fbshift ", lfsp->lfs_fbshift); + + printf("%s%d\t%s%d\t%s%x\t%s%qx\n", + "sushift ", lfsp->lfs_sushift, + "fsbtodb ", lfsp->lfs_fsbtodb, + "cksum ", lfsp->lfs_cksum, + "maxfilesize ", (long long)lfsp->lfs_maxfilesize); + + printf("Superblock disk addresses:"); + for (i = 0; i < LFS_MAXNUMSB; i++) + printf(" %x", lfsp->lfs_sboffs[i]); + printf("\n"); + + printf("Checkpoint Info\n"); + printf("%s%d\t%s%x\t%s%d\n", + "freehd ", lfsp->lfs_freehd, + "idaddr ", lfsp->lfs_idaddr, + "ifile ", lfsp->lfs_ifile); + printf("%s%x\t%s%d\t%s%x\t%s%x\t%s%x\t%s%x\n", + "bfree ", lfsp->lfs_bfree, + "nfiles ", lfsp->lfs_nfiles, + "lastseg ", lfsp->lfs_lastseg, + "nextseg ", lfsp->lfs_nextseg, + "curseg ", lfsp->lfs_curseg, + "offset ", lfsp->lfs_offset); + printf("tstamp %llx\n", (long 
long)lfsp->lfs_tstamp); +} + +void +lfs_dump_dinode(struct ufs1_dinode *dip) +{ + int i; + + printf("%s%u\t%s%d\t%s%u\t%s%u\t%s%qu\t%s%d\n", + "mode ", dip->di_mode, + "nlink ", dip->di_nlink, + "uid ", dip->di_uid, + "gid ", dip->di_gid, + "size ", (long long)dip->di_size, + "blocks ", dip->di_blocks); + printf("inum %d\n", dip->di_inumber); + printf("Direct Addresses\n"); + for (i = 0; i < NDADDR; i++) { + printf("\t%x", dip->di_db[i]); + if ((i % 6) == 5) + printf("\n"); + } + for (i = 0; i < NIADDR; i++) + printf("\t%x", dip->di_ib[i]); + printf("\n"); +} + +void +lfs_check_segsum(struct lfs *fs, struct segment *sp, char *file, int line) +{ + int actual; +#if 0 + static int offset; +#endif + + if ((actual = 1) == 1) + return; /* XXXX not checking this anymore, really */ + + if (sp->sum_bytes_left >= FINFOSIZE + && sp->fip->fi_nblocks > 512) { + printf("%s:%d: fi_nblocks = %d\n",file,line,sp->fip->fi_nblocks); +#ifdef DDB + Debugger(); +#endif + } + + if (sp->sum_bytes_left > 484) { + printf("%s:%d: bad value (%d = -%d) for sum_bytes_left\n", + file, line, sp->sum_bytes_left, fs->lfs_sumsize-sp->sum_bytes_left); + panic("too many bytes"); + } + + actual = fs->lfs_sumsize + /* amount taken up by FINFOs */ + - ((char *)&(sp->fip->fi_blocks[sp->fip->fi_nblocks]) - (char *)(sp->segsum)) + /* amount taken up by inode blocks */ + - sizeof(int32_t)*((sp->ninodes+INOPB(fs)-1) / INOPB(fs)); +#if 0 + if (actual - sp->sum_bytes_left < offset) + { + printf("%s:%d: offset changed %d -> %d\n", file, line, + offset, actual-sp->sum_bytes_left); + offset = actual - sp->sum_bytes_left; + /* panic("byte mismatch"); */ + } +#endif +#if 0 + if (actual != sp->sum_bytes_left) + printf("%s:%d: warning: segsum miscalc at %d (-%d => %d)\n", + file, line, sp->sum_bytes_left, + fs->lfs_sumsize-sp->sum_bytes_left, + actual); +#endif + if (sp->sum_bytes_left > 0 + && ((char *)(sp->segsum))[fs->lfs_sumsize + - sizeof(int32_t) * ((sp->ninodes+INOPB(fs)-1) / INOPB(fs)) + - sp->sum_bytes_left] != '\0') { + printf("%s:%d: warning: segsum overwrite at %d (-%d => %d)\n", + file, line, sp->sum_bytes_left, + fs->lfs_sumsize-sp->sum_bytes_left, + actual); +#ifdef DDB + Debugger(); +#endif + } +} + +void +lfs_check_bpp(struct lfs *fs, struct segment *sp, char *file, int line) +{ + daddr_t blkno; + struct buf **bpp; + struct vnode *devvp; + + devvp = VTOI(fs->lfs_ivnode)->i_devvp; + blkno = (*(sp->bpp))->b_blkno; + for (bpp = sp->bpp; bpp < sp->cbpp; bpp++) { + if ((*bpp)->b_blkno != blkno) { + if ((*bpp)->b_vp == devvp) { + printf("Oops, would misplace raw block " + "0x%" PRIx64 " at 0x%" PRIx64 "\n", + (*bpp)->b_blkno, + blkno); + } else { + printf("%s:%d: misplace ino %llu lbn %" PRId64 + " at 0x%" PRIx64 " instead of " + "0x%" PRIx64 "\n", + file, line, + (unsigned long long) + VTOI((*bpp)->b_vp)->i_number, + (*bpp)->b_lblkno, + blkno, + (*bpp)->b_blkno); + } + } + blkno += fsbtodb(fs, btofsb(fs, (*bpp)->b_bcount)); + } +} + +int lfs_debug_log_subsys[DLOG_MAX]; + +/* + * Log events from various debugging areas of LFS, depending on what + * the user has enabled. + */ +void +lfs_debug_log(int subsys, const char *fmt, ...) 
+{ + va_list ap; + + /* If not debugging this subsys, exit */ + if (lfs_debug_log_subsys[subsys] == 0) + return; + + va_start(ap, fmt); + vlog(LOG_DEBUG, fmt, ap); + va_end(ap); +} +#endif /* DEBUG */ diff --git a/include/ufs/lfs/lfs_extern.h b/sys/ufs/lfs/lfs_extern.h similarity index 100% rename from include/ufs/lfs/lfs_extern.h rename to sys/ufs/lfs/lfs_extern.h diff --git a/sys/ufs/lfs/lfs_inode.c b/sys/ufs/lfs/lfs_inode.c new file mode 100644 index 000000000..06bb9c193 --- /dev/null +++ b/sys/ufs/lfs/lfs_inode.c @@ -0,0 +1,902 @@ +/* $NetBSD: lfs_inode.c,v 1.126 2011/11/23 19:42:10 bouyer Exp $ */ + +/*- + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Konrad E. Schroder . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/* + * Copyright (c) 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_inode.c 8.9 (Berkeley) 5/8/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: lfs_inode.c,v 1.126 2011/11/23 19:42:10 bouyer Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_quota.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +static int lfs_update_seguse(struct lfs *, struct inode *ip, long, size_t); +static int lfs_indirtrunc (struct inode *, daddr_t, daddr_t, + daddr_t, int, long *, long *, long *, size_t *); +static int lfs_blkfree (struct lfs *, struct inode *, daddr_t, size_t, long *, size_t *); +static int lfs_vtruncbuf(struct vnode *, daddr_t, bool, int); + +/* Search a block for a specific dinode. */ +struct ufs1_dinode * +lfs_ifind(struct lfs *fs, ino_t ino, struct buf *bp) +{ + struct ufs1_dinode *dip = (struct ufs1_dinode *)bp->b_data; + struct ufs1_dinode *ldip, *fin; + + ASSERT_NO_SEGLOCK(fs); + /* + * Read the inode block backwards, since later versions of the + * inode will supercede earlier ones. Though it is unlikely, it is + * possible that the same inode will appear in the same inode block. + */ + fin = dip + INOPB(fs); + for (ldip = fin - 1; ldip >= dip; --ldip) + if (ldip->di_inumber == ino) + return (ldip); + + printf("searched %d entries\n", (int)(fin - dip)); + printf("offset is 0x%x (seg %d)\n", fs->lfs_offset, + dtosn(fs, fs->lfs_offset)); + printf("block is 0x%llx (seg %lld)\n", + (unsigned long long)dbtofsb(fs, bp->b_blkno), + (long long)dtosn(fs, dbtofsb(fs, bp->b_blkno))); + + return NULL; +} + +int +lfs_update(struct vnode *vp, const struct timespec *acc, + const struct timespec *mod, int updflags) +{ + struct inode *ip; + struct lfs *fs = VFSTOUFS(vp->v_mount)->um_lfs; + int flags; + + ASSERT_NO_SEGLOCK(fs); + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (0); + ip = VTOI(vp); + + /* + * If we are called from vinvalbuf, and the file's blocks have + * already been scheduled for writing, but the writes have not + * yet completed, lfs_vflush will not be called, and vinvalbuf + * will cause a panic. So, we must wait until any pending write + * for our inode completes, if we are called with UPDATE_WAIT set. + */ + mutex_enter(vp->v_interlock); + while ((updflags & (UPDATE_WAIT|UPDATE_DIROP)) == UPDATE_WAIT && + WRITEINPROG(vp)) { + DLOG((DLOG_SEG, "lfs_update: sleeping on ino %d" + " (in progress)\n", ip->i_number)); + cv_wait(&vp->v_cv, vp->v_interlock); + } + mutex_exit(vp->v_interlock); + LFS_ITIMES(ip, acc, mod, NULL); + if (updflags & UPDATE_CLOSE) + flags = ip->i_flag & (IN_MODIFIED | IN_ACCESSED | IN_CLEANING); + else + flags = ip->i_flag & (IN_MODIFIED | IN_CLEANING); + if (flags == 0) + return (0); + + /* If sync, push back the vnode and any dirty blocks it may have. */ + if ((updflags & (UPDATE_WAIT|UPDATE_DIROP)) == UPDATE_WAIT) { + /* Avoid flushing VU_DIROP. 
*/ + mutex_enter(&lfs_lock); + ++fs->lfs_diropwait; + while (vp->v_uflag & VU_DIROP) { + DLOG((DLOG_DIROP, "lfs_update: sleeping on inode %d" + " (dirops)\n", ip->i_number)); + DLOG((DLOG_DIROP, "lfs_update: vflags 0x%x, iflags" + " 0x%x\n", + vp->v_iflag | vp->v_vflag | vp->v_uflag, + ip->i_flag)); + if (fs->lfs_dirops == 0) + lfs_flush_fs(fs, SEGM_SYNC); + else + mtsleep(&fs->lfs_writer, PRIBIO+1, "lfs_fsync", + 0, &lfs_lock); + /* XXX KS - by falling out here, are we writing the vn + twice? */ + } + --fs->lfs_diropwait; + mutex_exit(&lfs_lock); + return lfs_vflush(vp); + } + return 0; +} + +#define SINGLE 0 /* index of single indirect block */ +#define DOUBLE 1 /* index of double indirect block */ +#define TRIPLE 2 /* index of triple indirect block */ +/* + * Truncate the inode oip to at most length size, freeing the + * disk blocks. + */ +/* VOP_BWRITE 1 + NIADDR + lfs_balloc == 2 + 2*NIADDR times */ + +int +lfs_truncate(struct vnode *ovp, off_t length, int ioflag, kauth_cred_t cred) +{ + daddr_t lastblock; + struct inode *oip = VTOI(ovp); + daddr_t bn, lbn, lastiblock[NIADDR], indir_lbn[NIADDR]; + /* XXX ondisk32 */ + int32_t newblks[NDADDR + NIADDR]; + struct lfs *fs; + struct buf *bp; + int offset, size, level; + long count, rcount, blocksreleased = 0, real_released = 0; + int i, nblocks; + int aflags, error, allerror = 0; + off_t osize; + long lastseg; + size_t bc; + int obufsize, odb; + int usepc; + struct ufsmount *ump = oip->i_ump; + + if (ovp->v_type == VCHR || ovp->v_type == VBLK || + ovp->v_type == VFIFO || ovp->v_type == VSOCK) { + KASSERT(oip->i_size == 0); + return 0; + } + + if (length < 0) + return (EINVAL); + + /* + * Just return and not update modification times. + */ + if (oip->i_size == length) { + /* still do a uvm_vnp_setsize() as writesize may be larger */ + uvm_vnp_setsize(ovp, length); + return (0); + } + + if (ovp->v_type == VLNK && + (oip->i_size < ump->um_maxsymlinklen || + (ump->um_maxsymlinklen == 0 && + oip->i_ffs1_blocks == 0))) { +#ifdef DIAGNOSTIC + if (length != 0) + panic("lfs_truncate: partial truncate of symlink"); +#endif + memset((char *)SHORTLINK(oip), 0, (u_int)oip->i_size); + oip->i_size = oip->i_ffs1_size = 0; + oip->i_flag |= IN_CHANGE | IN_UPDATE; + return (lfs_update(ovp, NULL, NULL, 0)); + } + if (oip->i_size == length) { + oip->i_flag |= IN_CHANGE | IN_UPDATE; + return (lfs_update(ovp, NULL, NULL, 0)); + } + fs = oip->i_lfs; + lfs_imtime(fs); + osize = oip->i_size; + usepc = (ovp->v_type == VREG && ovp != fs->lfs_ivnode); + + ASSERT_NO_SEGLOCK(fs); + /* + * Lengthen the size of the file. We must ensure that the + * last byte of the file is allocated. Since the smallest + * value of osize is 0, length will be at least 1. 
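When lengthening, the code that follows ultimately asks lfs_balloc() for the one byte at offset length - 1, which is enough to get the block holding the new end of file allocated and zero-filled. A minimal userland sketch of the same trick against an ordinary POSIX file; the file name and target size here are invented for illustration:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <unistd.h>

int
main(void)
{
	const char *path = "extend_demo.dat";	/* scratch file, name is arbitrary */
	off_t length = 3 * 4096 + 17;		/* arbitrary new size */
	struct stat st;
	int fd;

	if ((fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0644)) == -1) {
		perror("open");
		return EXIT_FAILURE;
	}
	/* Write a single zero byte at length - 1: the filesystem must now
	 * allocate (or at least account for) the block containing EOF,
	 * much as lfs_truncate() does via lfs_balloc(ovp, length - 1, 1, ...). */
	if (pwrite(fd, "", 1, length - 1) != 1) {
		perror("pwrite");
		return EXIT_FAILURE;
	}
	if (fstat(fd, &st) == 0)
		printf("size %lld, %lld 512-byte blocks in use\n",
		    (long long)st.st_size, (long long)st.st_blocks);
	close(fd);
	return EXIT_SUCCESS;
}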
+ */ + if (osize < length) { + if (length > ump->um_maxfilesize) + return (EFBIG); + aflags = B_CLRBUF; + if (ioflag & IO_SYNC) + aflags |= B_SYNC; + if (usepc) { + if (lblkno(fs, osize) < NDADDR && + lblkno(fs, osize) != lblkno(fs, length) && + blkroundup(fs, osize) != osize) { + off_t eob; + + eob = blkroundup(fs, osize); + uvm_vnp_setwritesize(ovp, eob); + error = ufs_balloc_range(ovp, osize, + eob - osize, cred, aflags); + if (error) { + (void) lfs_truncate(ovp, osize, + ioflag & IO_SYNC, cred); + return error; + } + if (ioflag & IO_SYNC) { + mutex_enter(ovp->v_interlock); + VOP_PUTPAGES(ovp, + trunc_page(osize & fs->lfs_bmask), + round_page(eob), + PGO_CLEANIT | PGO_SYNCIO); + } + } + uvm_vnp_setwritesize(ovp, length); + error = ufs_balloc_range(ovp, length - 1, 1, cred, + aflags); + if (error) { + (void) lfs_truncate(ovp, osize, + ioflag & IO_SYNC, cred); + return error; + } + uvm_vnp_setsize(ovp, length); + oip->i_flag |= IN_CHANGE | IN_UPDATE; + KASSERT(ovp->v_size == oip->i_size); + oip->i_lfs_hiblk = lblkno(fs, oip->i_size + fs->lfs_bsize - 1) - 1; + return (lfs_update(ovp, NULL, NULL, 0)); + } else { + error = lfs_reserve(fs, ovp, NULL, + btofsb(fs, (NIADDR + 2) << fs->lfs_bshift)); + if (error) + return (error); + error = lfs_balloc(ovp, length - 1, 1, cred, + aflags, &bp); + lfs_reserve(fs, ovp, NULL, + -btofsb(fs, (NIADDR + 2) << fs->lfs_bshift)); + if (error) + return (error); + oip->i_ffs1_size = oip->i_size = length; + uvm_vnp_setsize(ovp, length); + (void) VOP_BWRITE(bp->b_vp, bp); + oip->i_flag |= IN_CHANGE | IN_UPDATE; + oip->i_lfs_hiblk = lblkno(fs, oip->i_size + fs->lfs_bsize - 1) - 1; + return (lfs_update(ovp, NULL, NULL, 0)); + } + } + + if ((error = lfs_reserve(fs, ovp, NULL, + btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift))) != 0) + return (error); + + /* + * Shorten the size of the file. If the file is not being + * truncated to a block boundary, the contents of the + * partial block following the end of the file must be + * zero'ed in case it ever becomes accessible again because + * of subsequent file growth. Directories however are not + * zero'ed as they should grow back initialized to empty. + */ + offset = blkoff(fs, length); + lastseg = -1; + bc = 0; + + if (ovp != fs->lfs_ivnode) + lfs_seglock(fs, SEGM_PROT); + if (offset == 0) { + oip->i_size = oip->i_ffs1_size = length; + } else if (!usepc) { + lbn = lblkno(fs, length); + aflags = B_CLRBUF; + if (ioflag & IO_SYNC) + aflags |= B_SYNC; + error = lfs_balloc(ovp, length - 1, 1, cred, aflags, &bp); + if (error) { + lfs_reserve(fs, ovp, NULL, + -btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift)); + goto errout; + } + obufsize = bp->b_bufsize; + odb = btofsb(fs, bp->b_bcount); + oip->i_size = oip->i_ffs1_size = length; + size = blksize(fs, oip, lbn); + if (ovp->v_type != VDIR) + memset((char *)bp->b_data + offset, 0, + (u_int)(size - offset)); + allocbuf(bp, size, 1); + if ((bp->b_flags & B_LOCKED) != 0 && bp->b_iodone == NULL) { + mutex_enter(&lfs_lock); + locked_queue_bytes -= obufsize - bp->b_bufsize; + mutex_exit(&lfs_lock); + } + if (bp->b_oflags & BO_DELWRI) + fs->lfs_avail += odb - btofsb(fs, size); + (void) VOP_BWRITE(bp->b_vp, bp); + } else { /* vp->v_type == VREG && length < osize && offset != 0 */ + /* + * When truncating a regular file down to a non-block-aligned + * size, we must zero the part of last block which is past + * the new EOF. We must synchronously flush the zeroed pages + * to disk since the new pages will be invalidated as soon + * as we inform the VM system of the new, smaller size. 
+ * We must do this before acquiring the GLOCK, since fetching + * the pages will acquire the GLOCK internally. + * So there is a window where another thread could see a whole + * zeroed page past EOF, but that's life. + */ + daddr_t xlbn; + voff_t eoz; + + aflags = ioflag & IO_SYNC ? B_SYNC : 0; + error = ufs_balloc_range(ovp, length - 1, 1, cred, aflags); + if (error) { + lfs_reserve(fs, ovp, NULL, + -btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift)); + goto errout; + } + xlbn = lblkno(fs, length); + size = blksize(fs, oip, xlbn); + eoz = MIN(lblktosize(fs, xlbn) + size, osize); + ubc_zerorange(&ovp->v_uobj, length, eoz - length, + UBC_UNMAP_FLAG(ovp)); + if (round_page(eoz) > round_page(length)) { + mutex_enter(ovp->v_interlock); + error = VOP_PUTPAGES(ovp, round_page(length), + round_page(eoz), + PGO_CLEANIT | PGO_DEACTIVATE | + ((ioflag & IO_SYNC) ? PGO_SYNCIO : 0)); + if (error) { + lfs_reserve(fs, ovp, NULL, + -btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift)); + goto errout; + } + } + } + + genfs_node_wrlock(ovp); + + oip->i_size = oip->i_ffs1_size = length; + uvm_vnp_setsize(ovp, length); + + /* + * Calculate index into inode's block list of + * last direct and indirect blocks (if any) + * which we want to keep. Lastblock is -1 when + * the file is truncated to 0. + */ + /* Avoid sign overflow - XXX assumes that off_t is a quad_t. */ + if (length > QUAD_MAX - fs->lfs_bsize) + lastblock = lblkno(fs, QUAD_MAX - fs->lfs_bsize); + else + lastblock = lblkno(fs, length + fs->lfs_bsize - 1) - 1; + lastiblock[SINGLE] = lastblock - NDADDR; + lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs); + lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs); + nblocks = btofsb(fs, fs->lfs_bsize); + /* + * Record changed file and block pointers before we start + * freeing blocks. lastiblock values are also normalized to -1 + * for calls to lfs_indirtrunc below. + */ + memcpy((void *)newblks, (void *)&oip->i_ffs1_db[0], sizeof newblks); + for (level = TRIPLE; level >= SINGLE; level--) + if (lastiblock[level] < 0) { + newblks[NDADDR+level] = 0; + lastiblock[level] = -1; + } + for (i = NDADDR - 1; i > lastblock; i--) + newblks[i] = 0; + + oip->i_size = oip->i_ffs1_size = osize; + error = lfs_vtruncbuf(ovp, lastblock + 1, false, 0); + if (error && !allerror) + allerror = error; + + /* + * Indirect blocks first. + */ + indir_lbn[SINGLE] = -NDADDR; + indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) - 1; + indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1; + for (level = TRIPLE; level >= SINGLE; level--) { + bn = oip->i_ffs1_ib[level]; + if (bn != 0) { + error = lfs_indirtrunc(oip, indir_lbn[level], + bn, lastiblock[level], + level, &count, &rcount, + &lastseg, &bc); + if (error) + allerror = error; + real_released += rcount; + blocksreleased += count; + if (lastiblock[level] < 0) { + if (oip->i_ffs1_ib[level] > 0) + real_released += nblocks; + blocksreleased += nblocks; + oip->i_ffs1_ib[level] = 0; + lfs_blkfree(fs, oip, bn, fs->lfs_bsize, + &lastseg, &bc); + lfs_deregister_block(ovp, bn); + } + } + if (lastiblock[level] >= 0) + goto done; + } + + /* + * All whole direct blocks or frags. 
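The lastblock/lastiblock computation above decides how many levels of indirection survive the truncation; a negative lastiblock[] entry means that whole level is released. A standalone re-run of the arithmetic with assumed geometry (8 KB blocks, 12 direct pointers, 2048 32-bit pointers per indirect block); none of these constants are read from a real superblock:

#include <stdint.h>
#include <stdio.h>

#define BSIZE	8192				/* assumed block size */
#define BSHIFT	13				/* log2(BSIZE) */
#define NDADDR	12				/* direct pointers in the dinode */
#define NINDIR	(BSIZE / (int)sizeof(int32_t))	/* pointers per indirect block */

int
main(void)
{
	int64_t length = 300 * (int64_t)BSIZE;	/* truncate to 300 blocks */
	int64_t lastblock, single, dbl, triple;

	/* Last logical block kept; becomes -1 when truncating to zero. */
	lastblock = ((length + BSIZE - 1) >> BSHIFT) - 1;
	single = lastblock - NDADDR;
	dbl    = single - NINDIR;
	triple = dbl - (int64_t)NINDIR * NINDIR;

	printf("lastblock          = %lld\n", (long long)lastblock);
	printf("lastiblock[SINGLE] = %lld\n", (long long)single);
	printf("lastiblock[DOUBLE] = %lld\n", (long long)dbl);
	printf("lastiblock[TRIPLE] = %lld\n", (long long)triple);
	return 0;
}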
+ */ + for (i = NDADDR - 1; i > lastblock; i--) { + long bsize, obsize; + + bn = oip->i_ffs1_db[i]; + if (bn == 0) + continue; + bsize = blksize(fs, oip, i); + if (oip->i_ffs1_db[i] > 0) { + /* Check for fragment size changes */ + obsize = oip->i_lfs_fragsize[i]; + real_released += btofsb(fs, obsize); + oip->i_lfs_fragsize[i] = 0; + } else + obsize = 0; + blocksreleased += btofsb(fs, bsize); + oip->i_ffs1_db[i] = 0; + lfs_blkfree(fs, oip, bn, obsize, &lastseg, &bc); + lfs_deregister_block(ovp, bn); + } + if (lastblock < 0) + goto done; + + /* + * Finally, look for a change in size of the + * last direct block; release any frags. + */ + bn = oip->i_ffs1_db[lastblock]; + if (bn != 0) { + long oldspace, newspace; +#if 0 + long olddspace; +#endif + + /* + * Calculate amount of space we're giving + * back as old block size minus new block size. + */ + oldspace = blksize(fs, oip, lastblock); +#if 0 + olddspace = oip->i_lfs_fragsize[lastblock]; +#endif + + oip->i_size = oip->i_ffs1_size = length; + newspace = blksize(fs, oip, lastblock); + if (newspace == 0) + panic("itrunc: newspace"); + if (oldspace - newspace > 0) { + blocksreleased += btofsb(fs, oldspace - newspace); + } +#if 0 + if (bn > 0 && olddspace - newspace > 0) { + /* No segment accounting here, just vnode */ + real_released += btofsb(fs, olddspace - newspace); + } +#endif + } + +done: + /* Finish segment accounting corrections */ + lfs_update_seguse(fs, oip, lastseg, bc); +#ifdef DIAGNOSTIC + for (level = SINGLE; level <= TRIPLE; level++) + if ((newblks[NDADDR + level] == 0) != + ((oip->i_ffs1_ib[level]) == 0)) { + panic("lfs itrunc1"); + } + for (i = 0; i < NDADDR; i++) + if ((newblks[i] == 0) != (oip->i_ffs1_db[i] == 0)) { + panic("lfs itrunc2"); + } + if (length == 0 && + (!LIST_EMPTY(&ovp->v_cleanblkhd) || !LIST_EMPTY(&ovp->v_dirtyblkhd))) + panic("lfs itrunc3"); +#endif /* DIAGNOSTIC */ + /* + * Put back the real size. + */ + oip->i_size = oip->i_ffs1_size = length; + oip->i_lfs_effnblks -= blocksreleased; + oip->i_ffs1_blocks -= real_released; + mutex_enter(&lfs_lock); + fs->lfs_bfree += blocksreleased; + mutex_exit(&lfs_lock); +#ifdef DIAGNOSTIC + if (oip->i_size == 0 && + (oip->i_ffs1_blocks != 0 || oip->i_lfs_effnblks != 0)) { + printf("lfs_truncate: truncate to 0 but %d blks/%d effblks\n", + oip->i_ffs1_blocks, oip->i_lfs_effnblks); + panic("lfs_truncate: persistent blocks"); + } +#endif + + /* + * If we truncated to zero, take us off the paging queue. + */ + mutex_enter(&lfs_lock); + if (oip->i_size == 0 && oip->i_flags & IN_PAGING) { + oip->i_flags &= ~IN_PAGING; + TAILQ_REMOVE(&fs->lfs_pchainhd, oip, i_lfs_pchain); + } + mutex_exit(&lfs_lock); + + oip->i_flag |= IN_CHANGE; +#ifdef QUOTA + (void) chkdq(oip, -blocksreleased, NOCRED, 0); +#endif + lfs_reserve(fs, ovp, NULL, + -btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift)); + genfs_node_unlock(ovp); + errout: + oip->i_lfs_hiblk = lblkno(fs, oip->i_size + fs->lfs_bsize - 1) - 1; + if (ovp != fs->lfs_ivnode) + lfs_segunlock(fs); + return (allerror ? allerror : error); +} + +/* Update segment and avail usage information when removing a block. 
*/ +static int +lfs_blkfree(struct lfs *fs, struct inode *ip, daddr_t daddr, + size_t bsize, long *lastseg, size_t *num) +{ + long seg; + int error = 0; + + ASSERT_SEGLOCK(fs); + bsize = fragroundup(fs, bsize); + if (daddr > 0) { + if (*lastseg != (seg = dtosn(fs, daddr))) { + error = lfs_update_seguse(fs, ip, *lastseg, *num); + *num = bsize; + *lastseg = seg; + } else + *num += bsize; + } + + return error; +} + +/* Finish the accounting updates for a segment. */ +static int +lfs_update_seguse(struct lfs *fs, struct inode *ip, long lastseg, size_t num) +{ + struct segdelta *sd; + struct vnode *vp; + + ASSERT_SEGLOCK(fs); + if (lastseg < 0 || num == 0) + return 0; + + vp = ITOV(ip); + LIST_FOREACH(sd, &ip->i_lfs_segdhd, list) + if (sd->segnum == lastseg) + break; + if (sd == NULL) { + sd = malloc(sizeof(*sd), M_SEGMENT, M_WAITOK); + sd->segnum = lastseg; + sd->num = 0; + LIST_INSERT_HEAD(&ip->i_lfs_segdhd, sd, list); + } + sd->num += num; + + return 0; +} + +static void +lfs_finalize_seguse(struct lfs *fs, void *v) +{ + SEGUSE *sup; + struct buf *bp; + struct segdelta *sd; + LIST_HEAD(, segdelta) *hd = v; + + ASSERT_SEGLOCK(fs); + while((sd = LIST_FIRST(hd)) != NULL) { + LIST_REMOVE(sd, list); + LFS_SEGENTRY(sup, fs, sd->segnum, bp); + if (sd->num > sup->su_nbytes) { + printf("lfs_finalize_seguse: segment %ld short by %ld\n", + sd->segnum, (long)(sd->num - sup->su_nbytes)); + panic("lfs_finalize_seguse: negative bytes"); + sup->su_nbytes = sd->num; + } + sup->su_nbytes -= sd->num; + LFS_WRITESEGENTRY(sup, fs, sd->segnum, bp); + free(sd, M_SEGMENT); + } +} + +/* Finish the accounting updates for a segment. */ +void +lfs_finalize_ino_seguse(struct lfs *fs, struct inode *ip) +{ + ASSERT_SEGLOCK(fs); + lfs_finalize_seguse(fs, &ip->i_lfs_segdhd); +} + +/* Finish the accounting updates for a segment. */ +void +lfs_finalize_fs_seguse(struct lfs *fs) +{ + ASSERT_SEGLOCK(fs); + lfs_finalize_seguse(fs, &fs->lfs_segdhd); +} + +/* + * Release blocks associated with the inode ip and stored in the indirect + * block bn. Blocks are free'd in LIFO order up to (but not including) + * lastbn. If level is greater than SINGLE, the block is an indirect block + * and recursive calls to indirtrunc must be used to cleanse other indirect + * blocks. + * + * NB: triple indirect blocks are untested. + */ +static int +lfs_indirtrunc(struct inode *ip, daddr_t lbn, daddr_t dbn, + daddr_t lastbn, int level, long *countp, + long *rcountp, long *lastsegp, size_t *bcp) +{ + int i; + struct buf *bp; + struct lfs *fs = ip->i_lfs; + int32_t *bap; /* XXX ondisk32 */ + struct vnode *vp; + daddr_t nb, nlbn, last; + int32_t *copy = NULL; /* XXX ondisk32 */ + long blkcount, rblkcount, factor; + int nblocks, blocksreleased = 0, real_released = 0; + int error = 0, allerror = 0; + + ASSERT_SEGLOCK(fs); + /* + * Calculate index in current block of last + * block to be kept. -1 indicates the entire + * block so we need not calculate the index. + */ + factor = 1; + for (i = SINGLE; i < level; i++) + factor *= NINDIR(fs); + last = lastbn; + if (lastbn > 0) + last /= factor; + nblocks = btofsb(fs, fs->lfs_bsize); + /* + * Get buffer of block pointers, zero those entries corresponding + * to blocks to be free'd, and update on disk copy first. Since + * double(triple) indirect before single(double) indirect, calls + * to bmap on these blocks will fail. However, we already have + * the on disk address, so we have to set the b_blkno field + * explicitly instead of letting bread do everything for us. 
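Earlier in this file, lfs_blkfree() batches the byte counts of freed blocks per segment, flushing the accumulated total through lfs_update_seguse() only when the segment number changes. A small host-side sketch of that batching pattern; the segment geometry and the list of freed blocks are invented:

#include <stdio.h>

#define SEG_BYTES (1024 * 1024)	/* assumed bytes per segment */

/* Simplified dtosn(): map a byte address to a segment number. */
static long
dtosn(long long daddr)
{
	return (long)(daddr / SEG_BYTES);
}

static void
update_seguse(long seg, size_t num)
{
	if (seg < 0 || num == 0)
		return;
	printf("segment %ld: subtract %zu bytes\n", seg, num);
}

int
main(void)
{
	/* (address, size) pairs of blocks being freed, in truncation order. */
	static const struct { long long daddr; size_t size; } frees[] = {
		{ 1048576 + 0,     8192 },
		{ 1048576 + 8192,  8192 },
		{ 2097152 + 4096,  8192 },
		{ 2097152 + 12288, 4096 },
	};
	long lastseg = -1;
	size_t num = 0;
	size_t i;

	for (i = 0; i < sizeof(frees) / sizeof(frees[0]); i++) {
		long seg = dtosn(frees[i].daddr);

		if (seg != lastseg) {
			update_seguse(lastseg, num);	/* flush the previous run */
			lastseg = seg;
			num = frees[i].size;
		} else
			num += frees[i].size;
	}
	update_seguse(lastseg, num);	/* final flush */
	return 0;
}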
+ */ + vp = ITOV(ip); + bp = getblk(vp, lbn, (int)fs->lfs_bsize, 0, 0); + if (bp->b_oflags & (BO_DONE | BO_DELWRI)) { + /* Braces must be here in case trace evaluates to nothing. */ + trace(TR_BREADHIT, pack(vp, fs->lfs_bsize), lbn); + } else { + trace(TR_BREADMISS, pack(vp, fs->lfs_bsize), lbn); + curlwp->l_ru.ru_inblock++; /* pay for read */ + bp->b_flags |= B_READ; + if (bp->b_bcount > bp->b_bufsize) + panic("lfs_indirtrunc: bad buffer size"); + bp->b_blkno = fsbtodb(fs, dbn); + VOP_STRATEGY(vp, bp); + error = biowait(bp); + } + if (error) { + brelse(bp, 0); + *countp = *rcountp = 0; + return (error); + } + + bap = (int32_t *)bp->b_data; /* XXX ondisk32 */ + if (lastbn >= 0) { + copy = (int32_t *)lfs_malloc(fs, fs->lfs_bsize, LFS_NB_IBLOCK); + memcpy((void *)copy, (void *)bap, (u_int)fs->lfs_bsize); + memset((void *)&bap[last + 1], 0, + /* XXX ondisk32 */ + (u_int)(NINDIR(fs) - (last + 1)) * sizeof (int32_t)); + error = VOP_BWRITE(bp->b_vp, bp); + if (error) + allerror = error; + bap = copy; + } + + /* + * Recursively free totally unused blocks. + */ + for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last; + i--, nlbn += factor) { + nb = bap[i]; + if (nb == 0) + continue; + if (level > SINGLE) { + error = lfs_indirtrunc(ip, nlbn, nb, + (daddr_t)-1, level - 1, + &blkcount, &rblkcount, + lastsegp, bcp); + if (error) + allerror = error; + blocksreleased += blkcount; + real_released += rblkcount; + } + lfs_blkfree(fs, ip, nb, fs->lfs_bsize, lastsegp, bcp); + if (bap[i] > 0) + real_released += nblocks; + blocksreleased += nblocks; + } + + /* + * Recursively free last partial block. + */ + if (level > SINGLE && lastbn >= 0) { + last = lastbn % factor; + nb = bap[i]; + if (nb != 0) { + error = lfs_indirtrunc(ip, nlbn, nb, + last, level - 1, &blkcount, + &rblkcount, lastsegp, bcp); + if (error) + allerror = error; + real_released += rblkcount; + blocksreleased += blkcount; + } + } + + if (copy != NULL) { + lfs_free(fs, copy, LFS_NB_IBLOCK); + } else { + mutex_enter(&bufcache_lock); + if (bp->b_oflags & BO_DELWRI) { + LFS_UNLOCK_BUF(bp); + fs->lfs_avail += btofsb(fs, bp->b_bcount); + wakeup(&fs->lfs_avail); + } + brelsel(bp, BC_INVAL); + mutex_exit(&bufcache_lock); + } + + *countp = blocksreleased; + *rcountp = real_released; + return (allerror); +} + +/* + * Destroy any in core blocks past the truncation length. + * Inlined from vtruncbuf, so that lfs_avail could be updated. + * We take the seglock to prevent cleaning from occurring while we are + * invalidating blocks. 
+ */ +static int +lfs_vtruncbuf(struct vnode *vp, daddr_t lbn, bool catch, int slptimeo) +{ + struct buf *bp, *nbp; + int error; + struct lfs *fs; + voff_t off; + + off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift); + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO); + if (error) + return error; + + fs = VTOI(vp)->i_lfs; + + ASSERT_SEGLOCK(fs); + + mutex_enter(&bufcache_lock); +restart: + for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { + nbp = LIST_NEXT(bp, b_vnbufs); + if (bp->b_lblkno < lbn) + continue; + error = bbusy(bp, catch, slptimeo, NULL); + if (error == EPASSTHROUGH) + goto restart; + if (error != 0) { + mutex_exit(&bufcache_lock); + return (error); + } + mutex_enter(bp->b_objlock); + if (bp->b_oflags & BO_DELWRI) { + bp->b_oflags &= ~BO_DELWRI; + fs->lfs_avail += btofsb(fs, bp->b_bcount); + wakeup(&fs->lfs_avail); + } + mutex_exit(bp->b_objlock); + LFS_UNLOCK_BUF(bp); + brelsel(bp, BC_INVAL | BC_VFLUSH); + } + + for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = LIST_NEXT(bp, b_vnbufs); + if (bp->b_lblkno < lbn) + continue; + error = bbusy(bp, catch, slptimeo, NULL); + if (error == EPASSTHROUGH) + goto restart; + if (error != 0) { + mutex_exit(&bufcache_lock); + return (error); + } + mutex_enter(bp->b_objlock); + if (bp->b_oflags & BO_DELWRI) { + bp->b_oflags &= ~BO_DELWRI; + fs->lfs_avail += btofsb(fs, bp->b_bcount); + wakeup(&fs->lfs_avail); + } + mutex_exit(bp->b_objlock); + LFS_UNLOCK_BUF(bp); + brelsel(bp, BC_INVAL | BC_VFLUSH); + } + mutex_exit(&bufcache_lock); + + return (0); +} + diff --git a/sys/ufs/lfs/lfs_itimes.c b/sys/ufs/lfs/lfs_itimes.c new file mode 100644 index 000000000..3ef9f86c4 --- /dev/null +++ b/sys/ufs/lfs/lfs_itimes.c @@ -0,0 +1,118 @@ +/* $NetBSD: lfs_itimes.c,v 1.12 2008/04/28 20:24:11 martin Exp $ */ + +/*- + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Konrad E. Schroder . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include +__KERNEL_RCSID(0, "$NetBSD: lfs_itimes.c,v 1.12 2008/04/28 20:24:11 martin Exp $"); + +#include +#include +#include +#include + +#include + +#ifndef _KERNEL +#include "bufcache.h" +#include "vnode.h" +#include "lfs_user.h" +#define vnode uvnode +#define buf ubuf +#define panic call_panic +#else +#include +#include +#endif + +#include + +void +lfs_itimes(struct inode *ip, const struct timespec *acc, + const struct timespec *mod, const struct timespec *cre) +{ +#ifdef _KERNEL + struct timespec now; + + KASSERT(ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY)); + + vfs_timestamp(&now); +#endif + + if (ip->i_flag & IN_ACCESS) { +#ifdef _KERNEL + if (acc == NULL) + acc = &now; +#endif + ip->i_ffs1_atime = acc->tv_sec; + ip->i_ffs1_atimensec = acc->tv_nsec; + if (ip->i_lfs->lfs_version > 1) { + struct lfs *fs = ip->i_lfs; + struct buf *ibp; + IFILE *ifp; + + LFS_IENTRY(ifp, ip->i_lfs, ip->i_number, ibp); + ifp->if_atime_sec = acc->tv_sec; + ifp->if_atime_nsec = acc->tv_nsec; + LFS_BWRITE_LOG(ibp); + mutex_enter(&lfs_lock); + fs->lfs_flags |= LFS_IFDIRTY; + mutex_exit(&lfs_lock); + } else { + mutex_enter(&lfs_lock); + LFS_SET_UINO(ip, IN_ACCESSED); + mutex_exit(&lfs_lock); + } + } + if (ip->i_flag & (IN_CHANGE | IN_UPDATE | IN_MODIFY)) { + if (ip->i_flag & (IN_UPDATE | IN_MODIFY)) { +#ifdef _KERNEL + if (mod == NULL) + mod = &now; +#endif + ip->i_ffs1_mtime = mod->tv_sec; + ip->i_ffs1_mtimensec = mod->tv_nsec; + ip->i_modrev++; + } + if (ip->i_flag & (IN_CHANGE | IN_MODIFY)) { +#ifdef _KERNEL + if (cre == NULL) + cre = &now; +#endif + ip->i_ffs1_ctime = cre->tv_sec; + ip->i_ffs1_ctimensec = cre->tv_nsec; + } + mutex_enter(&lfs_lock); + if (ip->i_flag & (IN_CHANGE | IN_UPDATE)) + LFS_SET_UINO(ip, IN_MODIFIED); + if (ip->i_flag & IN_MODIFY) + LFS_SET_UINO(ip, IN_ACCESSED); + mutex_exit(&lfs_lock); + } + ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY); +} diff --git a/sys/ufs/lfs/lfs_rfw.c b/sys/ufs/lfs/lfs_rfw.c new file mode 100644 index 000000000..60d926ee4 --- /dev/null +++ b/sys/ufs/lfs/lfs_rfw.c @@ -0,0 +1,702 @@ +/* $NetBSD: lfs_rfw.c,v 1.12 2009/02/22 20:28:07 ad Exp $ */ + +/*- + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Konrad E. Schroder . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: lfs_rfw.c,v 1.12 2009/02/22 20:28:07 ad Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_quota.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +/* + * Roll-forward code. + */ +static daddr_t check_segsum(struct lfs *, daddr_t, u_int64_t, + kauth_cred_t, int, int *, struct lwp *); + +extern int lfs_do_rfw; + +/* + * Allocate a particular inode with a particular version number, freeing + * any previous versions of this inode that may have gone before. + * Used by the roll-forward code. + * + * XXX this function does not have appropriate locking to be used on a live fs; + * XXX but something similar could probably be used for an "undelete" call. + * + * Called with the Ifile inode locked. + */ +int +lfs_rf_valloc(struct lfs *fs, ino_t ino, int vers, struct lwp *l, + struct vnode **vpp) +{ + IFILE *ifp; + struct buf *bp, *cbp; + struct vnode *vp; + struct inode *ip; + ino_t tino, oldnext; + int error; + CLEANERINFO *cip; + + ASSERT_SEGLOCK(fs); /* XXX it doesn't, really */ + + /* + * First, just try a vget. If the version number is the one we want, + * we don't have to do anything else. If the version number is wrong, + * take appropriate action. + */ + error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, &vp); + if (error == 0) { + DLOG((DLOG_RF, "lfs_rf_valloc[1]: ino %d vp %p\n", ino, vp)); + + *vpp = vp; + ip = VTOI(vp); + if (ip->i_gen == vers) + return 0; + else if (ip->i_gen < vers) { + lfs_truncate(vp, (off_t)0, 0, NOCRED); + ip->i_gen = ip->i_ffs1_gen = vers; + LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE); + return 0; + } else { + DLOG((DLOG_RF, "ino %d: sought version %d, got %d\n", + ino, vers, ip->i_ffs1_gen)); + vput(vp); + *vpp = NULLVP; + return EEXIST; + } + } + + /* + * The inode is not in use. Find it on the free list. 
+ */ + /* If the Ifile is too short to contain this inum, extend it */ + while (VTOI(fs->lfs_ivnode)->i_size <= (ino / + fs->lfs_ifpb + fs->lfs_cleansz + fs->lfs_segtabsz) + << fs->lfs_bshift) { + lfs_extend_ifile(fs, NOCRED); + } + + LFS_IENTRY(ifp, fs, ino, bp); + oldnext = ifp->if_nextfree; + ifp->if_version = vers; + brelse(bp, 0); + + LFS_GET_HEADFREE(fs, cip, cbp, &ino); + if (ino) { + LFS_PUT_HEADFREE(fs, cip, cbp, oldnext); + } else { + tino = ino; + while (1) { + LFS_IENTRY(ifp, fs, tino, bp); + if (ifp->if_nextfree == ino || + ifp->if_nextfree == LFS_UNUSED_INUM) + break; + tino = ifp->if_nextfree; + brelse(bp, 0); + } + if (ifp->if_nextfree == LFS_UNUSED_INUM) { + brelse(bp, 0); + return ENOENT; + } + ifp->if_nextfree = oldnext; + LFS_BWRITE_LOG(bp); + } + + error = lfs_ialloc(fs, fs->lfs_ivnode, ino, vers, &vp); + if (error == 0) { + /* + * Make it VREG so we can put blocks on it. We will change + * this later if it turns out to be some other kind of file. + */ + ip = VTOI(vp); + ip->i_mode = ip->i_ffs1_mode = IFREG; + ip->i_nlink = ip->i_ffs1_nlink = 1; + ufs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p, &vp); + ip = VTOI(vp); + + DLOG((DLOG_RF, "lfs_rf_valloc: ino %d vp %p\n", ino, vp)); + + /* The dirop-nature of this vnode is past */ + lfs_unmark_vnode(vp); + (void)lfs_vunref(vp); + vp->v_uflag &= ~VU_DIROP; + mutex_enter(&lfs_lock); + --lfs_dirvcount; + --fs->lfs_dirvcount; + TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain); + wakeup(&lfs_dirvcount); + wakeup(&fs->lfs_dirvcount); + mutex_exit(&lfs_lock); + } + *vpp = vp; + return error; +} + +/* + * Load the appropriate indirect block, and change the appropriate pointer. + * Mark the block dirty. Do segment and avail accounting. + */ +static int +update_meta(struct lfs *fs, ino_t ino, int vers, daddr_t lbn, + daddr_t ndaddr, size_t size, struct lwp *l) +{ + int error; + struct vnode *vp; + struct inode *ip; +#ifdef DEBUG + daddr_t odaddr; + struct indir a[NIADDR]; + int num; + int i; +#endif /* DEBUG */ + struct buf *bp; + SEGUSE *sup; + + KASSERT(lbn >= 0); /* no indirect blocks */ + + if ((error = lfs_rf_valloc(fs, ino, vers, l, &vp)) != 0) { + DLOG((DLOG_RF, "update_meta: ino %d: lfs_rf_valloc" + " returned %d\n", ino, error)); + return error; + } + + if ((error = lfs_balloc(vp, (lbn << fs->lfs_bshift), size, + NOCRED, 0, &bp)) != 0) { + vput(vp); + return (error); + } + /* No need to write, the block is already on disk */ + if (bp->b_oflags & BO_DELWRI) { + LFS_UNLOCK_BUF(bp); + fs->lfs_avail += btofsb(fs, bp->b_bcount); + } + brelse(bp, BC_INVAL); + + /* + * Extend the file, if it is not large enough already. + * XXX this is not exactly right, we don't know how much of the + * XXX last block is actually used. We hope that an inode will + * XXX appear later to give the correct size. + */ + ip = VTOI(vp); + if (ip->i_size <= (lbn << fs->lfs_bshift)) { + u_int64_t newsize; + + if (lbn < NDADDR) + newsize = ip->i_ffs1_size = (lbn << fs->lfs_bshift) + + (size - fs->lfs_fsize) + 1; + else + newsize = ip->i_ffs1_size = (lbn << fs->lfs_bshift) + 1; + + if (ip->i_size < newsize) { + ip->i_size = newsize; + /* + * tell vm our new size for the case the inode won't + * appear later. + */ + uvm_vnp_setsize(vp, newsize); + } + } + + lfs_update_single(fs, NULL, vp, lbn, ndaddr, size); + + LFS_SEGENTRY(sup, fs, dtosn(fs, ndaddr), bp); + sup->su_nbytes += size; + LFS_WRITESEGENTRY(sup, fs, dtosn(fs, ndaddr), bp); + + /* differences here should be due to UNWRITTEN indirect blocks. 
*/ + KASSERT((lblkno(fs, ip->i_size) > NDADDR && + ip->i_lfs_effnblks == ip->i_ffs1_blocks) || + ip->i_lfs_effnblks >= ip->i_ffs1_blocks); + +#ifdef DEBUG + /* Now look again to make sure it worked */ + ufs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL); + for (i = num; i > 0; i--) { + if (!a[i].in_exists) + panic("update_meta: absent %d lv indirect block", i); + } + if (dbtofsb(fs, odaddr) != ndaddr) + DLOG((DLOG_RF, "update_meta: failed setting ino %d lbn %" + PRId64 " to %" PRId64 "\n", ino, lbn, ndaddr)); +#endif /* DEBUG */ + vput(vp); + return 0; +} + +static int +update_inoblk(struct lfs *fs, daddr_t offset, kauth_cred_t cred, + struct lwp *l) +{ + struct vnode *devvp, *vp; + struct inode *ip; + struct ufs1_dinode *dip; + struct buf *dbp, *ibp; + int error; + daddr_t daddr; + IFILE *ifp; + SEGUSE *sup; + + devvp = VTOI(fs->lfs_ivnode)->i_devvp; + + /* + * Get the inode, update times and perms. + * DO NOT update disk blocks, we do that separately. + */ + error = bread(devvp, fsbtodb(fs, offset), fs->lfs_ibsize, + cred, 0, &dbp); + if (error) { + DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error)); + return error; + } + dip = ((struct ufs1_dinode *)(dbp->b_data)) + INOPB(fs); + while (--dip >= (struct ufs1_dinode *)dbp->b_data) { + if (dip->di_inumber > LFS_IFILE_INUM) { + error = lfs_rf_valloc(fs, dip->di_inumber, dip->di_gen, + l, &vp); + if (error) { + DLOG((DLOG_RF, "update_inoblk: lfs_rf_valloc" + " returned %d\n", error)); + continue; + } + ip = VTOI(vp); + if (dip->di_size != ip->i_size) + lfs_truncate(vp, dip->di_size, 0, NOCRED); + /* Get mode, link count, size, and times */ + memcpy(ip->i_din.ffs1_din, dip, + offsetof(struct ufs1_dinode, di_db[0])); + + /* Then the rest, except di_blocks */ + ip->i_flags = ip->i_ffs1_flags = dip->di_flags; + ip->i_gen = ip->i_ffs1_gen = dip->di_gen; + ip->i_uid = ip->i_ffs1_uid = dip->di_uid; + ip->i_gid = ip->i_ffs1_gid = dip->di_gid; + + ip->i_mode = ip->i_ffs1_mode; + ip->i_nlink = ip->i_ffs1_nlink; + ip->i_size = ip->i_ffs1_size; + + LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE); + + /* Re-initialize to get type right */ + ufs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p, + &vp); + vput(vp); + + /* Record change in location */ + LFS_IENTRY(ifp, fs, dip->di_inumber, ibp); + daddr = ifp->if_daddr; + ifp->if_daddr = dbtofsb(fs, dbp->b_blkno); + error = LFS_BWRITE_LOG(ibp); /* Ifile */ + /* And do segment accounting */ + if (dtosn(fs, daddr) != dtosn(fs, dbtofsb(fs, dbp->b_blkno))) { + if (daddr > 0) { + LFS_SEGENTRY(sup, fs, dtosn(fs, daddr), + ibp); + sup->su_nbytes -= sizeof (struct ufs1_dinode); + LFS_WRITESEGENTRY(sup, fs, + dtosn(fs, daddr), + ibp); + } + LFS_SEGENTRY(sup, fs, dtosn(fs, dbtofsb(fs, dbp->b_blkno)), + ibp); + sup->su_nbytes += sizeof (struct ufs1_dinode); + LFS_WRITESEGENTRY(sup, fs, + dtosn(fs, dbtofsb(fs, dbp->b_blkno)), + ibp); + } + } + } + brelse(dbp, BC_AGE); + + return 0; +} + +#define CHECK_CKSUM 0x0001 /* Check the checksum to make sure it's valid */ +#define CHECK_UPDATE 0x0002 /* Update Ifile for new data blocks / inodes */ + +static daddr_t +check_segsum(struct lfs *fs, daddr_t offset, u_int64_t nextserial, + kauth_cred_t cred, int flags, int *pseg_flags, struct lwp *l) +{ + struct vnode *devvp; + struct buf *bp, *dbp; + int error, nblocks = 0, ninos, i, j; /* XXX: gcc */ + SEGSUM *ssp; + u_long *dp = NULL, *datap = NULL; /* XXX u_int32_t */ + daddr_t oldoffset; + int32_t *iaddr; /* XXX ondisk32 */ + FINFO *fip; + SEGUSE *sup; + size_t size; + + devvp = VTOI(fs->lfs_ivnode)->i_devvp; + /* + * If the 
segment has a superblock and we're at the top + * of the segment, skip the superblock. + */ + if (sntod(fs, dtosn(fs, offset)) == offset) { + LFS_SEGENTRY(sup, fs, dtosn(fs, offset), bp); + if (sup->su_flags & SEGUSE_SUPERBLOCK) + offset += btofsb(fs, LFS_SBPAD); + brelse(bp, 0); + } + + /* Read in the segment summary */ + error = bread(devvp, fsbtodb(fs, offset), fs->lfs_sumsize, + cred, 0, &bp); + if (error) + return -1; + + /* Check summary checksum */ + ssp = (SEGSUM *)bp->b_data; + if (flags & CHECK_CKSUM) { + if (ssp->ss_sumsum != cksum(&ssp->ss_datasum, + fs->lfs_sumsize - + sizeof(ssp->ss_sumsum))) { + DLOG((DLOG_RF, "Sumsum error at 0x%" PRIx64 "\n", offset)); + offset = -1; + goto err1; + } + if (ssp->ss_nfinfo == 0 && ssp->ss_ninos == 0) { + DLOG((DLOG_RF, "Empty pseg at 0x%" PRIx64 "\n", offset)); + offset = -1; + goto err1; + } + if (ssp->ss_create < fs->lfs_tstamp) { + DLOG((DLOG_RF, "Old data at 0x%" PRIx64 "\n", offset)); + offset = -1; + goto err1; + } + } + if (fs->lfs_version > 1) { + if (ssp->ss_serial != nextserial) { + DLOG((DLOG_RF, "Unexpected serial number at 0x%" PRIx64 + "\n", offset)); + offset = -1; + goto err1; + } + if (ssp->ss_ident != fs->lfs_ident) { + DLOG((DLOG_RF, "Incorrect fsid (0x%x vs 0x%x) at 0x%" + PRIx64 "\n", ssp->ss_ident, fs->lfs_ident, offset)); + offset = -1; + goto err1; + } + } + if (pseg_flags) + *pseg_flags = ssp->ss_flags; + oldoffset = offset; + offset += btofsb(fs, fs->lfs_sumsize); + + ninos = howmany(ssp->ss_ninos, INOPB(fs)); + /* XXX ondisk32 */ + iaddr = (int32_t *)((char*)bp->b_data + fs->lfs_sumsize - sizeof(int32_t)); + if (flags & CHECK_CKSUM) { + /* Count blocks */ + nblocks = 0; + fip = (FINFO *)((char*)bp->b_data + SEGSUM_SIZE(fs)); + for (i = 0; i < ssp->ss_nfinfo; ++i) { + nblocks += fip->fi_nblocks; + if (fip->fi_nblocks <= 0) + break; + /* XXX ondisk32 */ + fip = (FINFO *)(((char *)fip) + FINFOSIZE + + (fip->fi_nblocks * sizeof(int32_t))); + } + nblocks += ninos; + /* Create the sum array */ + datap = dp = (u_long *)malloc(nblocks * sizeof(u_long), + M_SEGMENT, M_WAITOK); + } + + /* Handle individual blocks */ + fip = (FINFO *)((char*)bp->b_data + SEGSUM_SIZE(fs)); + for (i = 0; i < ssp->ss_nfinfo || ninos; ++i) { + /* Inode block? 
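Each FINFO record above is a fixed-size head followed by fi_nblocks 32-bit block numbers, so both the counting pass and the per-block pass must recompute the record length at every step. A self-contained illustration of that variable-length-record walk; the record layout and counts are invented here, not the on-disk FINFO format:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Fixed head of each record; a variable number of int32_t block numbers
 * follows it directly in the buffer (the FINFOSIZE idea). */
struct rec_head {
	int32_t nblocks;
	int32_t ino;
};

int
main(void)
{
	unsigned char buf[256] = { 0 };
	static const int counts[] = { 2, 1, 4 };
	const int nrec = (int)(sizeof(counts) / sizeof(counts[0]));
	size_t off = 0;
	int i, total = 0;

	/* Lay down three record heads; the block numbers after each head
	 * are left as zeroes since the walk never looks at them. */
	for (i = 0; i < nrec; i++) {
		struct rec_head h = { counts[i], 100 + i };
		memcpy(buf + off, &h, sizeof(h));
		off += sizeof(h) + (size_t)counts[i] * sizeof(int32_t);
	}

	/* Walk the records the way check_segsum() totals nblocks. */
	off = 0;
	for (i = 0; i < nrec; i++) {
		struct rec_head h;
		memcpy(&h, buf + off, sizeof(h));
		total += h.nblocks;
		off += sizeof(h) + (size_t)h.nblocks * sizeof(int32_t);
	}
	printf("%d data blocks described in %zu bytes\n", total, off);
	return 0;
}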
*/ + if (ninos && *iaddr == offset) { + if (flags & CHECK_CKSUM) { + /* Read in the head and add to the buffer */ + error = bread(devvp, fsbtodb(fs, offset), fs->lfs_bsize, + cred, 0, &dbp); + if (error) { + offset = -1; + goto err2; + } + (*dp++) = ((u_long *)(dbp->b_data))[0]; + brelse(dbp, BC_AGE); + } + if (flags & CHECK_UPDATE) { + if ((error = update_inoblk(fs, offset, cred, l)) + != 0) { + offset = -1; + goto err2; + } + } + offset += btofsb(fs, fs->lfs_ibsize); + --iaddr; + --ninos; + --i; /* compensate */ + continue; + } + size = fs->lfs_bsize; + for (j = 0; j < fip->fi_nblocks; ++j) { + if (j == fip->fi_nblocks - 1) + size = fip->fi_lastlength; + if (flags & CHECK_CKSUM) { + error = bread(devvp, fsbtodb(fs, offset), size, + cred, 0, &dbp); + if (error) { + offset = -1; + goto err2; + } + (*dp++) = ((u_long *)(dbp->b_data))[0]; + brelse(dbp, BC_AGE); + } + /* Account for and update any direct blocks */ + if ((flags & CHECK_UPDATE) && + fip->fi_ino > LFS_IFILE_INUM && + fip->fi_blocks[j] >= 0) { + update_meta(fs, fip->fi_ino, fip->fi_version, + fip->fi_blocks[j], offset, size, l); + } + offset += btofsb(fs, size); + } + /* XXX ondisk32 */ + fip = (FINFO *)(((char *)fip) + FINFOSIZE + + fip->fi_nblocks * sizeof(int32_t)); + } + /* Checksum the array, compare */ + if ((flags & CHECK_CKSUM) && + ssp->ss_datasum != cksum(datap, nblocks * sizeof(u_long))) + { + DLOG((DLOG_RF, "Datasum error at 0x%" PRIx64 + " (wanted %x got %x)\n", + offset, ssp->ss_datasum, cksum(datap, nblocks * + sizeof(u_long)))); + offset = -1; + goto err2; + } + + /* If we're at the end of the segment, move to the next */ + if (dtosn(fs, offset + btofsb(fs, fs->lfs_sumsize + fs->lfs_bsize)) != + dtosn(fs, offset)) { + if (dtosn(fs, offset) == dtosn(fs, ssp->ss_next)) { + offset = -1; + goto err2; + } + offset = ssp->ss_next; + DLOG((DLOG_RF, "LFS roll forward: moving to offset 0x%" PRIx64 + " -> segment %d\n", offset, dtosn(fs,offset))); + } + + if (flags & CHECK_UPDATE) { + fs->lfs_avail -= (offset - oldoffset); + /* Don't clog the buffer queue */ + mutex_enter(&lfs_lock); + if (locked_queue_count > LFS_MAX_BUFS || + locked_queue_bytes > LFS_MAX_BYTES) { + lfs_flush(fs, SEGM_CKP, 0); + } + mutex_exit(&lfs_lock); + } + + err2: + if (flags & CHECK_CKSUM) + free(datap, M_SEGMENT); + err1: + brelse(bp, BC_AGE); + + /* XXX should we update the serial number even for bad psegs? */ + if ((flags & CHECK_UPDATE) && offset > 0 && fs->lfs_version > 1) + fs->lfs_serial = nextserial; + return offset; +} + +void +lfs_roll_forward(struct lfs *fs, struct mount *mp, struct lwp *l) +{ + int flags, dirty; + daddr_t offset, oldoffset, lastgoodpseg; + int sn, curseg, do_rollforward; + struct proc *p; + kauth_cred_t cred; + SEGUSE *sup; + struct buf *bp; + + p = l ? l->l_proc : NULL; + cred = p ? p->p_cred : NOCRED; + + /* + * Roll forward. + * + * We don't roll forward for v1 filesystems, because + * of the danger that the clock was turned back between the last + * checkpoint and crash. This would roll forward garbage. + * + * v2 filesystems don't have this problem because they use a + * monotonically increasing serial number instead of a timestamp. + */ + do_rollforward = (!(fs->lfs_pflags & LFS_PF_CLEAN) && + lfs_do_rfw && fs->lfs_version > 1 && p != NULL); + if (do_rollforward) { + u_int64_t nextserial; + /* + * Phase I: Find the address of the last good partial + * segment that was written after the checkpoint. Mark + * the segments in question dirty, so they won't be + * reallocated. 
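Stripped of the I/O and the Ifile bookkeeping, phase I reduces to walking the chain of partial segments, accepting each only while its serial number is the next expected value, and remembering the last address that checked out. A toy version of that loop over a hand-built chain; the addresses and serial numbers are invented:

#include <stdint.h>
#include <stdio.h>

/* Stand-in for an on-disk segment summary: its address and serial number. */
struct pseg {
	int64_t  addr;
	uint64_t serial;
};

int
main(void)
{
	static const struct pseg chain[] = {
		{ 1000, 101 },
		{ 1040, 102 },
		{ 1100, 87 },	/* stale summary left over from an old pass */
		{ 1160, 104 },
	};
	uint64_t nextserial = 101;	/* fs->lfs_serial + 1 at mount time */
	int64_t lastgood = 960;		/* the checkpointed fs->lfs_offset */
	size_t i;

	for (i = 0; i < sizeof(chain) / sizeof(chain[0]); i++) {
		if (chain[i].serial != nextserial)
			break;		/* not written after the checkpoint: stop */
		lastgood = chain[i].addr;
		nextserial++;
	}
	printf("last good partial segment at %lld\n", (long long)lastgood);
	return 0;
}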
+ */ + lastgoodpseg = oldoffset = offset = fs->lfs_offset; + flags = 0x0; + DLOG((DLOG_RF, "LFS roll forward phase 1: start at offset 0x%" + PRIx64 "\n", offset)); + LFS_SEGENTRY(sup, fs, dtosn(fs, offset), bp); + if (!(sup->su_flags & SEGUSE_DIRTY)) + --fs->lfs_nclean; + sup->su_flags |= SEGUSE_DIRTY; + LFS_WRITESEGENTRY(sup, fs, dtosn(fs, offset), bp); + nextserial = fs->lfs_serial + 1; + while ((offset = check_segsum(fs, offset, nextserial, + cred, CHECK_CKSUM, &flags, l)) > 0) { + nextserial++; + if (sntod(fs, oldoffset) != sntod(fs, offset)) { + LFS_SEGENTRY(sup, fs, dtosn(fs, oldoffset), + bp); + if (!(sup->su_flags & SEGUSE_DIRTY)) + --fs->lfs_nclean; + sup->su_flags |= SEGUSE_DIRTY; + LFS_WRITESEGENTRY(sup, fs, dtosn(fs, oldoffset), + bp); + } + + DLOG((DLOG_RF, "LFS roll forward phase 1: offset=0x%" + PRIx64 "\n", offset)); + if (flags & SS_DIROP) { + DLOG((DLOG_RF, "lfs_mountfs: dirops at 0x%" + PRIx64 "\n", oldoffset)); + if (!(flags & SS_CONT)) { + DLOG((DLOG_RF, "lfs_mountfs: dirops end " + "at 0x%" PRIx64 "\n", oldoffset)); + } + } + if (!(flags & SS_CONT)) + lastgoodpseg = offset; + oldoffset = offset; + } + if (flags & SS_CONT) { + DLOG((DLOG_RF, "LFS roll forward: warning: incomplete " + "dirops discarded\n")); + } + DLOG((DLOG_RF, "LFS roll forward phase 1: completed: " + "lastgoodpseg=0x%" PRIx64 "\n", lastgoodpseg)); + oldoffset = fs->lfs_offset; + if (fs->lfs_offset != lastgoodpseg) { + /* Don't overwrite what we're trying to preserve */ + offset = fs->lfs_offset; + fs->lfs_offset = lastgoodpseg; + fs->lfs_curseg = sntod(fs, dtosn(fs, fs->lfs_offset)); + for (sn = curseg = dtosn(fs, fs->lfs_curseg);;) { + sn = (sn + 1) % fs->lfs_nseg; + if (sn == curseg) + panic("lfs_mountfs: no clean segments"); + LFS_SEGENTRY(sup, fs, sn, bp); + dirty = (sup->su_flags & SEGUSE_DIRTY); + brelse(bp, 0); + if (!dirty) + break; + } + fs->lfs_nextseg = sntod(fs, sn); + + /* + * Phase II: Roll forward from the first superblock. + */ + while (offset != lastgoodpseg) { + DLOG((DLOG_RF, "LFS roll forward phase 2: 0x%" + PRIx64 "\n", offset)); + offset = check_segsum(fs, offset, + fs->lfs_serial + 1, cred, CHECK_UPDATE, + NULL, l); + } + + /* + * Finish: flush our changes to disk. + */ + lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC); + DLOG((DLOG_RF, "lfs_mountfs: roll forward ", + "recovered %lld blocks\n", + (long long)(lastgoodpseg - oldoffset))); + } + DLOG((DLOG_RF, "LFS roll forward complete\n")); + } +} diff --git a/sys/ufs/lfs/lfs_segment.c b/sys/ufs/lfs/lfs_segment.c new file mode 100644 index 000000000..aea143a5c --- /dev/null +++ b/sys/ufs/lfs/lfs_segment.c @@ -0,0 +1,2829 @@ +/* $NetBSD: lfs_segment.c,v 1.222 2011/07/11 08:27:40 hannken Exp $ */ + +/*- + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Konrad E. Schroder . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)lfs_segment.c 8.10 (Berkeley) 6/10/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: lfs_segment.c,v 1.222 2011/07/11 08:27:40 hannken Exp $"); + +#ifdef DEBUG +# define vndebug(vp, str) do { \ + if (VTOI(vp)->i_flag & IN_CLEANING) \ + DLOG((DLOG_WVNODE, "not writing ino %d because %s (op %d)\n", \ + VTOI(vp)->i_number, (str), op)); \ +} while(0) +#else +# define vndebug(vp, str) +#endif +#define ivndebug(vp, str) \ + DLOG((DLOG_WVNODE, "ino %d: %s\n", VTOI(vp)->i_number, (str))) + +#if defined(_KERNEL_OPT) +#include "opt_ddb.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +MALLOC_JUSTDEFINE(M_SEGMENT, "LFS segment", "Segment for LFS"); + +static void lfs_generic_callback(struct buf *, void (*)(struct buf *)); +static void lfs_free_aiodone(struct buf *); +static void lfs_super_aiodone(struct buf *); +static void lfs_cluster_aiodone(struct buf *); +static void lfs_cluster_callback(struct buf *); + +/* + * Determine if it's OK to start a partial in this segment, or if we need + * to go on to a new segment. + */ +#define LFS_PARTIAL_FITS(fs) \ + ((fs)->lfs_fsbpseg - ((fs)->lfs_offset - (fs)->lfs_curseg) > \ + (fs)->lfs_frag) + +/* + * Figure out whether we should do a checkpoint write or go ahead with + * an ordinary write. + */ +#define LFS_SHOULD_CHECKPOINT(fs, flags) \ + ((flags & SEGM_CLEAN) == 0 && \ + ((fs->lfs_nactive > LFS_MAX_ACTIVE || \ + (flags & SEGM_CKP) || \ + fs->lfs_nclean < LFS_MAX_ACTIVE))) + +int lfs_match_fake(struct lfs *, struct buf *); +void lfs_newseg(struct lfs *); +/* XXX ondisk32 */ +void lfs_shellsort(struct buf **, int32_t *, int, int); +void lfs_supercallback(struct buf *); +void lfs_updatemeta(struct segment *); +void lfs_writesuper(struct lfs *, daddr_t); +int lfs_writevnodes(struct lfs *fs, struct mount *mp, + struct segment *sp, int dirops); + +int lfs_allclean_wakeup; /* Cleaner wakeup address. */ +int lfs_writeindir = 1; /* whether to flush indir on non-ckp */ +int lfs_clean_vnhead = 0; /* Allow freeing to head of vn list */ +int lfs_dirvcount = 0; /* # active dirops */ + +/* Statistics Counters */ +int lfs_dostats = 1; +struct lfs_stats lfs_stats; + +/* op values to lfs_writevnodes */ +#define VN_REG 0 +#define VN_DIROP 1 +#define VN_EMPTY 2 +#define VN_CLEAN 3 + +/* + * XXX KS - Set modification time on the Ifile, so the cleaner can + * read the fs mod time off of it. We don't set IN_UPDATE here, + * since we don't really need this to be flushed to disk (and in any + * case that wouldn't happen to the Ifile until we checkpoint). + */ +void +lfs_imtime(struct lfs *fs) +{ + struct timespec ts; + struct inode *ip; + + ASSERT_MAYBE_SEGLOCK(fs); + vfs_timestamp(&ts); + ip = VTOI(fs->lfs_ivnode); + ip->i_ffs1_mtime = ts.tv_sec; + ip->i_ffs1_mtimensec = ts.tv_nsec; +} + +/* + * Ifile and meta data blocks are not marked busy, so segment writes MUST be + * single threaded. Currently, there are two paths into lfs_segwrite, sync() + * and getnewbuf(). They both mark the file system busy. Lfs_vflush() + * explicitly marks the file system busy. So lfs_segwrite is safe. I think. 
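LFS_SHOULD_CHECKPOINT() above folds the decision into one expression: cleaner writes never checkpoint, and any other write checkpoints when explicitly asked, when too many segments are active, or when clean segments are running low. The same test as a plain function, with invented flag values and an assumed LFS_MAX_ACTIVE threshold:

#include <stdbool.h>
#include <stdio.h>

#define SEGM_CKP	0x01	/* made-up flag values, for the sketch only */
#define SEGM_CLEAN	0x02
#define LFS_MAX_ACTIVE	10	/* assumed threshold */

/* Same shape as LFS_SHOULD_CHECKPOINT(fs, flags), with the fs fields
 * passed in as plain integers. */
static bool
should_checkpoint(int nactive, int nclean, int flags)
{
	return (flags & SEGM_CLEAN) == 0 &&
	    (nactive > LFS_MAX_ACTIVE ||
	    (flags & SEGM_CKP) != 0 ||
	    nclean < LFS_MAX_ACTIVE);
}

int
main(void)
{
	printf("%d\n", should_checkpoint(3, 50, 0));		/* routine write: 0 */
	printf("%d\n", should_checkpoint(3, 4, 0));		/* clean segments scarce: 1 */
	printf("%d\n", should_checkpoint(3, 50, SEGM_CKP));	/* explicit checkpoint: 1 */
	printf("%d\n", should_checkpoint(3, 4, SEGM_CLEAN));	/* cleaner write: 0 */
	return 0;
}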
+ */ + +#define IS_FLUSHING(fs,vp) ((fs)->lfs_flushvp == (vp)) + +int +lfs_vflush(struct vnode *vp) +{ + struct inode *ip; + struct lfs *fs; + struct segment *sp; + struct buf *bp, *nbp, *tbp, *tnbp; + int error; + int flushed; + int relock; + int loopcount; + + ip = VTOI(vp); + fs = VFSTOUFS(vp->v_mount)->um_lfs; + relock = 0; + + top: + ASSERT_NO_SEGLOCK(fs); + if (ip->i_flag & IN_CLEANING) { + ivndebug(vp,"vflush/in_cleaning"); + mutex_enter(&lfs_lock); + LFS_CLR_UINO(ip, IN_CLEANING); + LFS_SET_UINO(ip, IN_MODIFIED); + mutex_exit(&lfs_lock); + + /* + * Toss any cleaning buffers that have real counterparts + * to avoid losing new data. + */ + mutex_enter(vp->v_interlock); + for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = LIST_NEXT(bp, b_vnbufs); + if (!LFS_IS_MALLOC_BUF(bp)) + continue; + /* + * Look for pages matching the range covered + * by cleaning blocks. It's okay if more dirty + * pages appear, so long as none disappear out + * from under us. + */ + if (bp->b_lblkno > 0 && vp->v_type == VREG && + vp != fs->lfs_ivnode) { + struct vm_page *pg; + voff_t off; + + for (off = lblktosize(fs, bp->b_lblkno); + off < lblktosize(fs, bp->b_lblkno + 1); + off += PAGE_SIZE) { + pg = uvm_pagelookup(&vp->v_uobj, off); + if (pg == NULL) + continue; + if ((pg->flags & PG_CLEAN) == 0 || + pmap_is_modified(pg)) { + fs->lfs_avail += btofsb(fs, + bp->b_bcount); + wakeup(&fs->lfs_avail); + mutex_exit(vp->v_interlock); + lfs_freebuf(fs, bp); + mutex_enter(vp->v_interlock); + bp = NULL; + break; + } + } + } + for (tbp = LIST_FIRST(&vp->v_dirtyblkhd); tbp; + tbp = tnbp) + { + tnbp = LIST_NEXT(tbp, b_vnbufs); + if (tbp->b_vp == bp->b_vp + && tbp->b_lblkno == bp->b_lblkno + && tbp != bp) + { + fs->lfs_avail += btofsb(fs, + bp->b_bcount); + wakeup(&fs->lfs_avail); + mutex_exit(vp->v_interlock); + lfs_freebuf(fs, bp); + mutex_enter(vp->v_interlock); + bp = NULL; + break; + } + } + } + } else { + mutex_enter(vp->v_interlock); + } + + /* If the node is being written, wait until that is done */ + while (WRITEINPROG(vp)) { + ivndebug(vp,"vflush/writeinprog"); + cv_wait(&vp->v_cv, vp->v_interlock); + } + mutex_exit(vp->v_interlock); + + /* Protect against VI_XLOCK deadlock in vinvalbuf() */ + lfs_seglock(fs, SEGM_SYNC); + + /* If we're supposed to flush a freed inode, just toss it */ + if (ip->i_lfs_iflags & LFSI_DELETED) { + DLOG((DLOG_VNODE, "lfs_vflush: ino %d freed, not flushing\n", + ip->i_number)); + /* Drain v_numoutput */ + mutex_enter(vp->v_interlock); + while (vp->v_numoutput > 0) { + cv_wait(&vp->v_cv, vp->v_interlock); + } + KASSERT(vp->v_numoutput == 0); + mutex_exit(vp->v_interlock); + + mutex_enter(&bufcache_lock); + for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = LIST_NEXT(bp, b_vnbufs); + + KASSERT((bp->b_flags & B_GATHERED) == 0); + if (bp->b_oflags & BO_DELWRI) { /* XXX always true? 
*/ + fs->lfs_avail += btofsb(fs, bp->b_bcount); + wakeup(&fs->lfs_avail); + } + /* Copied from lfs_writeseg */ + if (bp->b_iodone != NULL) { + mutex_exit(&bufcache_lock); + biodone(bp); + mutex_enter(&bufcache_lock); + } else { + bremfree(bp); + LFS_UNLOCK_BUF(bp); + mutex_enter(vp->v_interlock); + bp->b_flags &= ~(B_READ | B_GATHERED); + bp->b_oflags = (bp->b_oflags & ~BO_DELWRI) | BO_DONE; + bp->b_error = 0; + reassignbuf(bp, vp); + mutex_exit(vp->v_interlock); + brelse(bp, 0); + } + } + mutex_exit(&bufcache_lock); + LFS_CLR_UINO(ip, IN_CLEANING); + LFS_CLR_UINO(ip, IN_MODIFIED | IN_ACCESSED); + ip->i_flag &= ~IN_ALLMOD; + DLOG((DLOG_VNODE, "lfs_vflush: done not flushing ino %d\n", + ip->i_number)); + lfs_segunlock(fs); + + KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL); + + return 0; + } + + fs->lfs_flushvp = vp; + if (LFS_SHOULD_CHECKPOINT(fs, fs->lfs_sp->seg_flags)) { + error = lfs_segwrite(vp->v_mount, SEGM_CKP | SEGM_SYNC); + fs->lfs_flushvp = NULL; + KASSERT(fs->lfs_flushvp_fakevref == 0); + lfs_segunlock(fs); + + /* Make sure that any pending buffers get written */ + mutex_enter(vp->v_interlock); + while (vp->v_numoutput > 0) { + cv_wait(&vp->v_cv, vp->v_interlock); + } + KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL); + KASSERT(vp->v_numoutput == 0); + mutex_exit(vp->v_interlock); + + return error; + } + sp = fs->lfs_sp; + + flushed = 0; + if (VPISEMPTY(vp)) { + lfs_writevnodes(fs, vp->v_mount, sp, VN_EMPTY); + ++flushed; + } else if ((ip->i_flag & IN_CLEANING) && + (fs->lfs_sp->seg_flags & SEGM_CLEAN)) { + ivndebug(vp,"vflush/clean"); + lfs_writevnodes(fs, vp->v_mount, sp, VN_CLEAN); + ++flushed; + } else if (lfs_dostats) { + if (!VPISEMPTY(vp) || (VTOI(vp)->i_flag & IN_ALLMOD)) + ++lfs_stats.vflush_invoked; + ivndebug(vp,"vflush"); + } + +#ifdef DIAGNOSTIC + if (vp->v_uflag & VU_DIROP) { + DLOG((DLOG_VNODE, "lfs_vflush: flushing VU_DIROP\n")); + /* panic("lfs_vflush: VU_DIROP being flushed...this can\'t happen"); */ + } +#endif + + do { + loopcount = 0; + do { + if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) { + relock = lfs_writefile(fs, sp, vp); + if (relock) { + /* + * Might have to wait for the + * cleaner to run; but we're + * still not done with this vnode. + */ + KDASSERT(ip->i_number != LFS_IFILE_INUM); + lfs_writeinode(fs, sp, ip); + mutex_enter(&lfs_lock); + LFS_SET_UINO(ip, IN_MODIFIED); + mutex_exit(&lfs_lock); + lfs_writeseg(fs, sp); + lfs_segunlock(fs); + lfs_segunlock_relock(fs); + goto top; + } + } + /* + * If we begin a new segment in the middle of writing + * the Ifile, it creates an inconsistent checkpoint, + * since the Ifile information for the new segment + * is not up-to-date. Take care of this here by + * sending the Ifile through again in case there + * are newly dirtied blocks. But wait, there's more! + * This second Ifile write could *also* cross a segment + * boundary, if the first one was large. The second + * one is guaranteed to be no more than 8 blocks, + * though (two segment blocks and supporting indirects) + * so the third write *will not* cross the boundary. 
+ */ + if (vp == fs->lfs_ivnode) { + lfs_writefile(fs, sp, vp); + lfs_writefile(fs, sp, vp); + } +#ifdef DEBUG + if (++loopcount > 2) + log(LOG_NOTICE, "lfs_vflush: looping count=%d\n", loopcount); +#endif + } while (lfs_writeinode(fs, sp, ip)); + } while (lfs_writeseg(fs, sp) && ip->i_number == LFS_IFILE_INUM); + + if (lfs_dostats) { + ++lfs_stats.nwrites; + if (sp->seg_flags & SEGM_SYNC) + ++lfs_stats.nsync_writes; + if (sp->seg_flags & SEGM_CKP) + ++lfs_stats.ncheckpoints; + } + /* + * If we were called from somewhere that has already held the seglock + * (e.g., lfs_markv()), the lfs_segunlock will not wait for + * the write to complete because we are still locked. + * Since lfs_vflush() must return the vnode with no dirty buffers, + * we must explicitly wait, if that is the case. + * + * We compare the iocount against 1, not 0, because it is + * artificially incremented by lfs_seglock(). + */ + mutex_enter(&lfs_lock); + if (fs->lfs_seglock > 1) { + while (fs->lfs_iocount > 1) + (void)mtsleep(&fs->lfs_iocount, PRIBIO + 1, + "lfs_vflush", 0, &lfs_lock); + } + mutex_exit(&lfs_lock); + + lfs_segunlock(fs); + + /* Wait for these buffers to be recovered by aiodoned */ + mutex_enter(vp->v_interlock); + while (vp->v_numoutput > 0) { + cv_wait(&vp->v_cv, vp->v_interlock); + } + KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL); + KASSERT(vp->v_numoutput == 0); + mutex_exit(vp->v_interlock); + + fs->lfs_flushvp = NULL; + KASSERT(fs->lfs_flushvp_fakevref == 0); + + return (0); +} + +int +lfs_writevnodes(struct lfs *fs, struct mount *mp, struct segment *sp, int op) +{ + struct inode *ip; + struct vnode *vp; + int inodes_written = 0, only_cleaning; + int error = 0; + + ASSERT_SEGLOCK(fs); + loop: + /* start at last (newest) vnode. */ + mutex_enter(&mntvnode_lock); + TAILQ_FOREACH_REVERSE(vp, &mp->mnt_vnodelist, vnodelst, v_mntvnodes) { + /* + * If the vnode that we are about to sync is no longer + * associated with this mount point, start over. + */ + if (vp->v_mount != mp) { + DLOG((DLOG_VNODE, "lfs_writevnodes: starting over\n")); + /* + * After this, pages might be busy + * due to our own previous putpages. + * Start actual segment write here to avoid deadlock. + */ + mutex_exit(&mntvnode_lock); + (void)lfs_writeseg(fs, sp); + goto loop; + } + + mutex_enter(vp->v_interlock); + if (vp->v_type == VNON || vismarker(vp) || + (vp->v_iflag & VI_CLEAN) != 0) { + mutex_exit(vp->v_interlock); + continue; + } + + ip = VTOI(vp); + if ((op == VN_DIROP && !(vp->v_uflag & VU_DIROP)) || + (op != VN_DIROP && op != VN_CLEAN && + (vp->v_uflag & VU_DIROP))) { + mutex_exit(vp->v_interlock); + vndebug(vp,"dirop"); + continue; + } + + if (op == VN_EMPTY && !VPISEMPTY(vp)) { + mutex_exit(vp->v_interlock); + vndebug(vp,"empty"); + continue; + } + + if (op == VN_CLEAN && ip->i_number != LFS_IFILE_INUM + && vp != fs->lfs_flushvp + && !(ip->i_flag & IN_CLEANING)) { + mutex_exit(vp->v_interlock); + vndebug(vp,"cleaning"); + continue; + } + + mutex_exit(&mntvnode_lock); + if (lfs_vref(vp)) { + vndebug(vp,"vref"); + mutex_enter(&mntvnode_lock); + continue; + } + + only_cleaning = 0; + /* + * Write the inode/file if dirty and it's not the IFILE. 
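+ * The Ifile itself is skipped here; its blocks and inode are
+ * written by lfs_segwrite when the checkpoint is taken.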
+ */ + if ((ip->i_flag & IN_ALLMOD) || !VPISEMPTY(vp)) { + only_cleaning = + ((ip->i_flag & IN_ALLMOD) == IN_CLEANING); + + if (ip->i_number != LFS_IFILE_INUM) { + error = lfs_writefile(fs, sp, vp); + if (error) { + lfs_vunref(vp); + if (error == EAGAIN) { + /* + * This error from lfs_putpages + * indicates we need to drop + * the segment lock and start + * over after the cleaner has + * had a chance to run. + */ + lfs_writeinode(fs, sp, ip); + lfs_writeseg(fs, sp); + if (!VPISEMPTY(vp) && + !WRITEINPROG(vp) && + !(ip->i_flag & IN_ALLMOD)) { + mutex_enter(&lfs_lock); + LFS_SET_UINO(ip, IN_MODIFIED); + mutex_exit(&lfs_lock); + } + mutex_enter(&mntvnode_lock); + break; + } + error = 0; /* XXX not quite right */ + mutex_enter(&mntvnode_lock); + continue; + } + + if (!VPISEMPTY(vp)) { + if (WRITEINPROG(vp)) { + ivndebug(vp,"writevnodes/write2"); + } else if (!(ip->i_flag & IN_ALLMOD)) { + mutex_enter(&lfs_lock); + LFS_SET_UINO(ip, IN_MODIFIED); + mutex_exit(&lfs_lock); + } + } + (void) lfs_writeinode(fs, sp, ip); + inodes_written++; + } + } + + if (lfs_clean_vnhead && only_cleaning) + lfs_vunref_head(vp); + else + lfs_vunref(vp); + + mutex_enter(&mntvnode_lock); + } + mutex_exit(&mntvnode_lock); + return error; +} + +/* + * Do a checkpoint. + */ +int +lfs_segwrite(struct mount *mp, int flags) +{ + struct buf *bp; + struct inode *ip; + struct lfs *fs; + struct segment *sp; + struct vnode *vp; + SEGUSE *segusep; + int do_ckp, did_ckp, error; + unsigned n, segleft, maxseg, sn, i, curseg; + int writer_set = 0; + int dirty; + int redo; + int um_error; + int loopcount; + + fs = VFSTOUFS(mp)->um_lfs; + ASSERT_MAYBE_SEGLOCK(fs); + + if (fs->lfs_ronly) + return EROFS; + + lfs_imtime(fs); + + /* + * Allocate a segment structure and enough space to hold pointers to + * the maximum possible number of buffers which can be described in a + * single summary block. + */ + do_ckp = LFS_SHOULD_CHECKPOINT(fs, flags); + + lfs_seglock(fs, flags | (do_ckp ? SEGM_CKP : 0)); + sp = fs->lfs_sp; + if (sp->seg_flags & (SEGM_CLEAN | SEGM_CKP)) + do_ckp = 1; + + /* + * If lfs_flushvp is non-NULL, we are called from lfs_vflush, + * in which case we have to flush *all* buffers off of this vnode. + * We don't care about other nodes, but write any non-dirop nodes + * anyway in anticipation of another getnewvnode(). + * + * If we're cleaning we only write cleaning and ifile blocks, and + * no dirops, since otherwise we'd risk corruption in a crash. + */ + if (sp->seg_flags & SEGM_CLEAN) + lfs_writevnodes(fs, mp, sp, VN_CLEAN); + else if (!(sp->seg_flags & SEGM_FORCE_CKP)) { + do { + um_error = lfs_writevnodes(fs, mp, sp, VN_REG); + + if (do_ckp || fs->lfs_dirops == 0) { + if (!writer_set) { + lfs_writer_enter(fs, "lfs writer"); + writer_set = 1; + } + error = lfs_writevnodes(fs, mp, sp, VN_DIROP); + if (um_error == 0) + um_error = error; + /* In case writevnodes errored out */ + lfs_flush_dirops(fs); + ((SEGSUM *)(sp->segsum))->ss_flags &= ~(SS_CONT); + lfs_finalize_fs_seguse(fs); + } + if (do_ckp && um_error) { + lfs_segunlock_relock(fs); + sp = fs->lfs_sp; + } + } while (do_ckp && um_error != 0); + } + + /* + * If we are doing a checkpoint, mark everything since the + * last checkpoint as no longer ACTIVE. 
+ */ + if (do_ckp || fs->lfs_doifile) { + segleft = fs->lfs_nseg; + curseg = 0; + for (n = 0; n < fs->lfs_segtabsz; n++) { + dirty = 0; + if (bread(fs->lfs_ivnode, fs->lfs_cleansz + n, + fs->lfs_bsize, NOCRED, B_MODIFY, &bp)) + panic("lfs_segwrite: ifile read"); + segusep = (SEGUSE *)bp->b_data; + maxseg = min(segleft, fs->lfs_sepb); + for (i = 0; i < maxseg; i++) { + sn = curseg + i; + if (sn != dtosn(fs, fs->lfs_curseg) && + segusep->su_flags & SEGUSE_ACTIVE) { + segusep->su_flags &= ~SEGUSE_ACTIVE; + --fs->lfs_nactive; + ++dirty; + } + fs->lfs_suflags[fs->lfs_activesb][sn] = + segusep->su_flags; + if (fs->lfs_version > 1) + ++segusep; + else + segusep = (SEGUSE *) + ((SEGUSE_V1 *)segusep + 1); + } + + if (dirty) + error = LFS_BWRITE_LOG(bp); /* Ifile */ + else + brelse(bp, 0); + segleft -= fs->lfs_sepb; + curseg += fs->lfs_sepb; + } + } + + KASSERT(LFS_SEGLOCK_HELD(fs)); + + did_ckp = 0; + if (do_ckp || fs->lfs_doifile) { + vp = fs->lfs_ivnode; + vn_lock(vp, LK_EXCLUSIVE); + loopcount = 0; + do { +#ifdef DEBUG + LFS_ENTER_LOG("pretend", __FILE__, __LINE__, 0, 0, curproc->p_pid); +#endif + mutex_enter(&lfs_lock); + fs->lfs_flags &= ~LFS_IFDIRTY; + mutex_exit(&lfs_lock); + + ip = VTOI(vp); + + if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) { + /* + * Ifile has no pages, so we don't need + * to check error return here. + */ + lfs_writefile(fs, sp, vp); + /* + * Ensure the Ifile takes the current segment + * into account. See comment in lfs_vflush. + */ + lfs_writefile(fs, sp, vp); + lfs_writefile(fs, sp, vp); + } + + if (ip->i_flag & IN_ALLMOD) + ++did_ckp; +#if 0 + redo = (do_ckp ? lfs_writeinode(fs, sp, ip) : 0); +#else + redo = lfs_writeinode(fs, sp, ip); +#endif + redo += lfs_writeseg(fs, sp); + mutex_enter(&lfs_lock); + redo += (fs->lfs_flags & LFS_IFDIRTY); + mutex_exit(&lfs_lock); +#ifdef DEBUG + if (++loopcount > 2) + log(LOG_NOTICE, "lfs_segwrite: looping count=%d\n", + loopcount); +#endif + } while (redo && do_ckp); + + /* + * Unless we are unmounting, the Ifile may continue to have + * dirty blocks even after a checkpoint, due to changes to + * inodes' atime. If we're checkpointing, it's "impossible" + * for other parts of the Ifile to be dirty after the loop + * above, since we hold the segment lock. + */ + mutex_enter(vp->v_interlock); + if (LIST_EMPTY(&vp->v_dirtyblkhd)) { + LFS_CLR_UINO(ip, IN_ALLMOD); + } +#ifdef DIAGNOSTIC + else if (do_ckp) { + int do_panic = 0; + LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) { + if (bp->b_lblkno < fs->lfs_cleansz + + fs->lfs_segtabsz && + !(bp->b_flags & B_GATHERED)) { + printf("ifile lbn %ld still dirty (flags %lx)\n", + (long)bp->b_lblkno, + (long)bp->b_flags); + ++do_panic; + } + } + if (do_panic) + panic("dirty blocks"); + } +#endif + mutex_exit(vp->v_interlock); + VOP_UNLOCK(vp); + } else { + (void) lfs_writeseg(fs, sp); + } + + /* Note Ifile no longer needs to be written */ + fs->lfs_doifile = 0; + if (writer_set) + lfs_writer_leave(fs); + + /* + * If we didn't write the Ifile, we didn't really do anything. + * That means that (1) there is a checkpoint on disk and (2) + * nothing has changed since it was written. + * + * Take the flags off of the segment so that lfs_segunlock + * doesn't have to write the superblock either. 
+ */ + if (do_ckp && !did_ckp) { + sp->seg_flags &= ~SEGM_CKP; + } + + if (lfs_dostats) { + ++lfs_stats.nwrites; + if (sp->seg_flags & SEGM_SYNC) + ++lfs_stats.nsync_writes; + if (sp->seg_flags & SEGM_CKP) + ++lfs_stats.ncheckpoints; + } + lfs_segunlock(fs); + return (0); +} + +/* + * Write the dirty blocks associated with a vnode. + */ +int +lfs_writefile(struct lfs *fs, struct segment *sp, struct vnode *vp) +{ + struct finfo *fip; + struct inode *ip; + int i, frag; + int error; + + ASSERT_SEGLOCK(fs); + error = 0; + ip = VTOI(vp); + + fip = sp->fip; + lfs_acquire_finfo(fs, ip->i_number, ip->i_gen); + + if (vp->v_uflag & VU_DIROP) + ((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT); + + if (sp->seg_flags & SEGM_CLEAN) { + lfs_gather(fs, sp, vp, lfs_match_fake); + /* + * For a file being flushed, we need to write *all* blocks. + * This means writing the cleaning blocks first, and then + * immediately following with any non-cleaning blocks. + * The same is true of the Ifile since checkpoints assume + * that all valid Ifile blocks are written. + */ + if (IS_FLUSHING(fs, vp) || vp == fs->lfs_ivnode) { + lfs_gather(fs, sp, vp, lfs_match_data); + /* + * Don't call VOP_PUTPAGES: if we're flushing, + * we've already done it, and the Ifile doesn't + * use the page cache. + */ + } + } else { + lfs_gather(fs, sp, vp, lfs_match_data); + /* + * If we're flushing, we've already called VOP_PUTPAGES + * so don't do it again. Otherwise, we want to write + * everything we've got. + */ + if (!IS_FLUSHING(fs, vp)) { + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, 0, 0, + PGO_CLEANIT | PGO_ALLPAGES | PGO_LOCKED); + } + } + + /* + * It may not be necessary to write the meta-data blocks at this point, + * as the roll-forward recovery code should be able to reconstruct the + * list. + * + * We have to write them anyway, though, under two conditions: (1) the + * vnode is being flushed (for reuse by vinvalbuf); or (2) we are + * checkpointing. + * + * BUT if we are cleaning, we might have indirect blocks that refer to + * new blocks not being written yet, in addition to fragments being + * moved out of a cleaned segment. If that is the case, don't + * write the indirect blocks, or the finfo will have a small block + * in the middle of it! + * XXX in this case isn't the inode size wrong too? + */ + frag = 0; + if (sp->seg_flags & SEGM_CLEAN) { + for (i = 0; i < NDADDR; i++) + if (ip->i_lfs_fragsize[i] > 0 && + ip->i_lfs_fragsize[i] < fs->lfs_bsize) + ++frag; + } +#ifdef DIAGNOSTIC + if (frag > 1) + panic("lfs_writefile: more than one fragment!"); +#endif + if (IS_FLUSHING(fs, vp) || + (frag == 0 && (lfs_writeindir || (sp->seg_flags & SEGM_CKP)))) { + lfs_gather(fs, sp, vp, lfs_match_indir); + lfs_gather(fs, sp, vp, lfs_match_dindir); + lfs_gather(fs, sp, vp, lfs_match_tindir); + } + fip = sp->fip; + lfs_release_finfo(fs); + + return error; +} + +/* + * Update segment accounting to reflect this inode's change of address. + */ +static int +lfs_update_iaddr(struct lfs *fs, struct segment *sp, struct inode *ip, daddr_t ndaddr) +{ + struct buf *bp; + daddr_t daddr; + IFILE *ifp; + SEGUSE *sup; + ino_t ino; + int redo_ifile, error; + u_int32_t sn; + + redo_ifile = 0; + + /* + * If updating the ifile, update the super-block. Update the disk + * address and access times for this inode in the ifile. 
+ */ + ino = ip->i_number; + if (ino == LFS_IFILE_INUM) { + daddr = fs->lfs_idaddr; + fs->lfs_idaddr = dbtofsb(fs, ndaddr); + } else { + LFS_IENTRY(ifp, fs, ino, bp); + daddr = ifp->if_daddr; + ifp->if_daddr = dbtofsb(fs, ndaddr); + error = LFS_BWRITE_LOG(bp); /* Ifile */ + } + + /* + * If this is the Ifile and lfs_offset is set to the first block + * in the segment, dirty the new segment's accounting block + * (XXX should already be dirty?) and tell the caller to do it again. + */ + if (ip->i_number == LFS_IFILE_INUM) { + sn = dtosn(fs, fs->lfs_offset); + if (sntod(fs, sn) + btofsb(fs, fs->lfs_sumsize) == + fs->lfs_offset) { + LFS_SEGENTRY(sup, fs, sn, bp); + KASSERT(bp->b_oflags & BO_DELWRI); + LFS_WRITESEGENTRY(sup, fs, sn, bp); + /* fs->lfs_flags |= LFS_IFDIRTY; */ + redo_ifile |= 1; + } + } + + /* + * The inode's last address should not be in the current partial + * segment, except under exceptional circumstances (lfs_writevnodes + * had to start over, and in the meantime more blocks were written + * to a vnode). Both inodes will be accounted to this segment + * in lfs_writeseg so we need to subtract the earlier version + * here anyway. The segment count can temporarily dip below + * zero here; keep track of how many duplicates we have in + * "dupino" so we don't panic below. + */ + if (daddr >= fs->lfs_lastpseg && daddr <= fs->lfs_offset) { + ++sp->ndupino; + DLOG((DLOG_SEG, "lfs_writeinode: last inode addr in current pseg " + "(ino %d daddr 0x%llx) ndupino=%d\n", ino, + (long long)daddr, sp->ndupino)); + } + /* + * Account the inode: it no longer belongs to its former segment, + * though it will not belong to the new segment until that segment + * is actually written. + */ + if (daddr != LFS_UNUSED_DADDR) { + u_int32_t oldsn = dtosn(fs, daddr); +#ifdef DIAGNOSTIC + int ndupino = (sp->seg_number == oldsn) ? sp->ndupino : 0; +#endif + LFS_SEGENTRY(sup, fs, oldsn, bp); +#ifdef DIAGNOSTIC + if (sup->su_nbytes + + sizeof (struct ufs1_dinode) * ndupino + < sizeof (struct ufs1_dinode)) { + printf("lfs_writeinode: negative bytes " + "(segment %" PRIu32 " short by %d, " + "oldsn=%" PRIu32 ", cursn=%" PRIu32 + ", daddr=%" PRId64 ", su_nbytes=%u, " + "ndupino=%d)\n", + dtosn(fs, daddr), + (int)sizeof (struct ufs1_dinode) * + (1 - sp->ndupino) - sup->su_nbytes, + oldsn, sp->seg_number, daddr, + (unsigned int)sup->su_nbytes, + sp->ndupino); + panic("lfs_writeinode: negative bytes"); + sup->su_nbytes = sizeof (struct ufs1_dinode); + } +#endif + DLOG((DLOG_SU, "seg %d -= %d for ino %d inode\n", + dtosn(fs, daddr), sizeof (struct ufs1_dinode), ino)); + sup->su_nbytes -= sizeof (struct ufs1_dinode); + redo_ifile |= + (ino == LFS_IFILE_INUM && !(bp->b_flags & B_GATHERED)); + if (redo_ifile) { + mutex_enter(&lfs_lock); + fs->lfs_flags |= LFS_IFDIRTY; + mutex_exit(&lfs_lock); + /* Don't double-account */ + fs->lfs_idaddr = 0x0; + } + LFS_WRITESEGENTRY(sup, fs, oldsn, bp); /* Ifile */ + } + + return redo_ifile; +} + +int +lfs_writeinode(struct lfs *fs, struct segment *sp, struct inode *ip) +{ + struct buf *bp; + struct ufs1_dinode *cdp; + daddr_t daddr; + int32_t *daddrp; /* XXX ondisk32 */ + int i, ndx; + int redo_ifile = 0; + int gotblk = 0; + int count; + + ASSERT_SEGLOCK(fs); + if (!(ip->i_flag & IN_ALLMOD)) + return (0); + + /* Can't write ifile when writer is not set */ + KASSERT(ip->i_number != LFS_IFILE_INUM || fs->lfs_writer > 0 || + (sp->seg_flags & SEGM_CLEAN)); + + /* + * If this is the Ifile, see if writing it here will generate a + * temporary misaccounting. 
If it will, do the accounting and write + * the blocks, postponing the inode write until the accounting is + * solid. + */ + count = 0; + while (ip->i_number == LFS_IFILE_INUM) { + int redo = 0; + + if (sp->idp == NULL && sp->ibp == NULL && + (sp->seg_bytes_left < fs->lfs_ibsize || + sp->sum_bytes_left < sizeof(int32_t))) { + (void) lfs_writeseg(fs, sp); + continue; + } + + /* Look for dirty Ifile blocks */ + LIST_FOREACH(bp, &fs->lfs_ivnode->v_dirtyblkhd, b_vnbufs) { + if (!(bp->b_flags & B_GATHERED)) { + redo = 1; + break; + } + } + + if (redo == 0) + redo = lfs_update_iaddr(fs, sp, ip, 0x0); + if (redo == 0) + break; + + if (sp->idp) { + sp->idp->di_inumber = 0; + sp->idp = NULL; + } + ++count; + if (count > 2) + log(LOG_NOTICE, "lfs_writeinode: looping count=%d\n", count); + lfs_writefile(fs, sp, fs->lfs_ivnode); + } + + /* Allocate a new inode block if necessary. */ + if ((ip->i_number != LFS_IFILE_INUM || sp->idp == NULL) && + sp->ibp == NULL) { + /* Allocate a new segment if necessary. */ + if (sp->seg_bytes_left < fs->lfs_ibsize || + sp->sum_bytes_left < sizeof(int32_t)) + (void) lfs_writeseg(fs, sp); + + /* Get next inode block. */ + daddr = fs->lfs_offset; + fs->lfs_offset += btofsb(fs, fs->lfs_ibsize); + sp->ibp = *sp->cbpp++ = + getblk(VTOI(fs->lfs_ivnode)->i_devvp, + fsbtodb(fs, daddr), fs->lfs_ibsize, 0, 0); + gotblk++; + + /* Zero out inode numbers */ + for (i = 0; i < INOPB(fs); ++i) + ((struct ufs1_dinode *)sp->ibp->b_data)[i].di_inumber = + 0; + + ++sp->start_bpp; + fs->lfs_avail -= btofsb(fs, fs->lfs_ibsize); + /* Set remaining space counters. */ + sp->seg_bytes_left -= fs->lfs_ibsize; + sp->sum_bytes_left -= sizeof(int32_t); + ndx = fs->lfs_sumsize / sizeof(int32_t) - + sp->ninodes / INOPB(fs) - 1; + ((int32_t *)(sp->segsum))[ndx] = daddr; + } + + /* Check VU_DIROP in case there is a new file with no data blocks */ + if (ITOV(ip)->v_uflag & VU_DIROP) + ((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT); + + /* Update the inode times and copy the inode onto the inode page. */ + /* XXX kludge --- don't redirty the ifile just to put times on it */ + if (ip->i_number != LFS_IFILE_INUM) + LFS_ITIMES(ip, NULL, NULL, NULL); + + /* + * If this is the Ifile, and we've already written the Ifile in this + * partial segment, just overwrite it (it's not on disk yet) and + * continue. + * + * XXX we know that the bp that we get the second time around has + * already been gathered. + */ + if (ip->i_number == LFS_IFILE_INUM && sp->idp) { + *(sp->idp) = *ip->i_din.ffs1_din; + ip->i_lfs_osize = ip->i_size; + return 0; + } + + bp = sp->ibp; + cdp = ((struct ufs1_dinode *)bp->b_data) + (sp->ninodes % INOPB(fs)); + *cdp = *ip->i_din.ffs1_din; + + /* + * If cleaning, link counts and directory file sizes cannot change, + * since those would be directory operations---even if the file + * we are writing is marked VU_DIROP we should write the old values. + * If we're not cleaning, of course, update the values so we get + * current values the next time we clean. 
+ */ + if (sp->seg_flags & SEGM_CLEAN) { + if (ITOV(ip)->v_uflag & VU_DIROP) { + cdp->di_nlink = ip->i_lfs_odnlink; + /* if (ITOV(ip)->v_type == VDIR) */ + cdp->di_size = ip->i_lfs_osize; + } + } else { + ip->i_lfs_odnlink = cdp->di_nlink; + ip->i_lfs_osize = ip->i_size; + } + + + /* We can finish the segment accounting for truncations now */ + lfs_finalize_ino_seguse(fs, ip); + + /* + * If we are cleaning, ensure that we don't write UNWRITTEN disk + * addresses to disk; possibly change the on-disk record of + * the inode size, either by reverting to the previous size + * (in the case of cleaning) or by verifying the inode's block + * holdings (in the case of files being allocated as they are being + * written). + * XXX By not writing UNWRITTEN blocks, we are making the lfs_avail + * XXX count on disk wrong by the same amount. We should be + * XXX able to "borrow" from lfs_avail and return it after the + * XXX Ifile is written. See also in lfs_writeseg. + */ + + /* Check file size based on highest allocated block */ + if (((ip->i_ffs1_mode & IFMT) == IFREG || + (ip->i_ffs1_mode & IFMT) == IFDIR) && + ip->i_size > ((ip->i_lfs_hiblk + 1) << fs->lfs_bshift)) { + cdp->di_size = (ip->i_lfs_hiblk + 1) << fs->lfs_bshift; + DLOG((DLOG_SEG, "lfs_writeinode: ino %d size %" PRId64 " -> %" + PRId64 "\n", (int)ip->i_number, ip->i_size, cdp->di_size)); + } + if (ip->i_lfs_effnblks != ip->i_ffs1_blocks) { + DLOG((DLOG_SEG, "lfs_writeinode: cleansing ino %d eff %d != nblk %d)" + " at %x\n", ip->i_number, ip->i_lfs_effnblks, + ip->i_ffs1_blocks, fs->lfs_offset)); + for (daddrp = cdp->di_db; daddrp < cdp->di_ib + NIADDR; + daddrp++) { + if (*daddrp == UNWRITTEN) { + DLOG((DLOG_SEG, "lfs_writeinode: wiping UNWRITTEN\n")); + *daddrp = 0; + } + } + } + +#ifdef DIAGNOSTIC + /* + * Check dinode held blocks against dinode size. + * This should be identical to the check in lfs_vget(). + */ + for (i = (cdp->di_size + fs->lfs_bsize - 1) >> fs->lfs_bshift; + i < NDADDR; i++) { + KASSERT(i >= 0); + if ((cdp->di_mode & IFMT) == IFLNK) + continue; + if (((cdp->di_mode & IFMT) == IFBLK || + (cdp->di_mode & IFMT) == IFCHR) && i == 0) + continue; + if (cdp->di_db[i] != 0) { +# ifdef DEBUG + lfs_dump_dinode(cdp); +# endif + panic("writing inconsistent inode"); + } + } +#endif /* DIAGNOSTIC */ + + if (ip->i_flag & IN_CLEANING) + LFS_CLR_UINO(ip, IN_CLEANING); + else { + /* XXX IN_ALLMOD */ + LFS_CLR_UINO(ip, IN_ACCESSED | IN_ACCESS | IN_CHANGE | + IN_UPDATE | IN_MODIFY); + if (ip->i_lfs_effnblks == ip->i_ffs1_blocks) + LFS_CLR_UINO(ip, IN_MODIFIED); + else { + DLOG((DLOG_VNODE, "lfs_writeinode: ino %d: real " + "blks=%d, eff=%d\n", ip->i_number, + ip->i_ffs1_blocks, ip->i_lfs_effnblks)); + } + } + + if (ip->i_number == LFS_IFILE_INUM) { + /* We know sp->idp == NULL */ + sp->idp = ((struct ufs1_dinode *)bp->b_data) + + (sp->ninodes % INOPB(fs)); + + /* Not dirty any more */ + mutex_enter(&lfs_lock); + fs->lfs_flags &= ~LFS_IFDIRTY; + mutex_exit(&lfs_lock); + } + + if (gotblk) { + mutex_enter(&bufcache_lock); + LFS_LOCK_BUF(bp); + brelsel(bp, 0); + mutex_exit(&bufcache_lock); + } + + /* Increment inode count in segment summary block. */ + ++((SEGSUM *)(sp->segsum))->ss_ninos; + + /* If this page is full, set flag to allocate a new page. 
*/ + if (++sp->ninodes % INOPB(fs) == 0) + sp->ibp = NULL; + + redo_ifile = lfs_update_iaddr(fs, sp, ip, bp->b_blkno); + + KASSERT(redo_ifile == 0); + return (redo_ifile); +} + +int +lfs_gatherblock(struct segment *sp, struct buf *bp, kmutex_t *mptr) +{ + struct lfs *fs; + int vers; + int j, blksinblk; + + ASSERT_SEGLOCK(sp->fs); + /* + * If full, finish this segment. We may be doing I/O, so + * release and reacquire the splbio(). + */ +#ifdef DIAGNOSTIC + if (sp->vp == NULL) + panic ("lfs_gatherblock: Null vp in segment"); +#endif + fs = sp->fs; + blksinblk = howmany(bp->b_bcount, fs->lfs_bsize); + if (sp->sum_bytes_left < sizeof(int32_t) * blksinblk || + sp->seg_bytes_left < bp->b_bcount) { + if (mptr) + mutex_exit(mptr); + lfs_updatemeta(sp); + + vers = sp->fip->fi_version; + (void) lfs_writeseg(fs, sp); + + /* Add the current file to the segment summary. */ + lfs_acquire_finfo(fs, VTOI(sp->vp)->i_number, vers); + + if (mptr) + mutex_enter(mptr); + return (1); + } + + if (bp->b_flags & B_GATHERED) { + DLOG((DLOG_SEG, "lfs_gatherblock: already gathered! Ino %d," + " lbn %" PRId64 "\n", + sp->fip->fi_ino, bp->b_lblkno)); + return (0); + } + + /* Insert into the buffer list, update the FINFO block. */ + bp->b_flags |= B_GATHERED; + + *sp->cbpp++ = bp; + for (j = 0; j < blksinblk; j++) { + sp->fip->fi_blocks[sp->fip->fi_nblocks++] = bp->b_lblkno + j; + /* This block's accounting moves from lfs_favail to lfs_avail */ + lfs_deregister_block(sp->vp, bp->b_lblkno + j); + } + + sp->sum_bytes_left -= sizeof(int32_t) * blksinblk; + sp->seg_bytes_left -= bp->b_bcount; + return (0); +} + +int +lfs_gather(struct lfs *fs, struct segment *sp, struct vnode *vp, + int (*match)(struct lfs *, struct buf *)) +{ + struct buf *bp, *nbp; + int count = 0; + + ASSERT_SEGLOCK(fs); + if (vp->v_type == VBLK) + return 0; + KASSERT(sp->vp == NULL); + sp->vp = vp; + mutex_enter(&bufcache_lock); + +#ifndef LFS_NO_BACKBUF_HACK +/* This is a hack to see if ordering the blocks in LFS makes a difference. */ +# define BUF_OFFSET \ + (((char *)&LIST_NEXT(bp, b_vnbufs)) - (char *)bp) +# define BACK_BUF(BP) \ + ((struct buf *)(((char *)(BP)->b_vnbufs.le_prev) - BUF_OFFSET)) +# define BEG_OF_LIST \ + ((struct buf *)(((char *)&LIST_FIRST(&vp->v_dirtyblkhd)) - BUF_OFFSET)) + +loop: + /* Find last buffer. 
*/ + for (bp = LIST_FIRST(&vp->v_dirtyblkhd); + bp && LIST_NEXT(bp, b_vnbufs) != NULL; + bp = LIST_NEXT(bp, b_vnbufs)) + /* nothing */; + for (; bp && bp != BEG_OF_LIST; bp = nbp) { + nbp = BACK_BUF(bp); +#else /* LFS_NO_BACKBUF_HACK */ +loop: + for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = LIST_NEXT(bp, b_vnbufs); +#endif /* LFS_NO_BACKBUF_HACK */ + if ((bp->b_cflags & BC_BUSY) != 0 || + (bp->b_flags & B_GATHERED) != 0 || !match(fs, bp)) { +#ifdef DEBUG + if (vp == fs->lfs_ivnode && + (bp->b_cflags & BC_BUSY) != 0 && + (bp->b_flags & B_GATHERED) == 0) + log(LOG_NOTICE, "lfs_gather: ifile lbn %" + PRId64 " busy (%x) at 0x%x", + bp->b_lblkno, bp->b_flags, + (unsigned)fs->lfs_offset); +#endif + continue; + } +#ifdef DIAGNOSTIC +# ifdef LFS_USE_B_INVAL + if ((bp->b_flags & BC_INVAL) != 0 && bp->b_iodone == NULL) { + DLOG((DLOG_SEG, "lfs_gather: lbn %" PRId64 + " is BC_INVAL\n", bp->b_lblkno)); + VOP_PRINT(bp->b_vp); + } +# endif /* LFS_USE_B_INVAL */ + if (!(bp->b_oflags & BO_DELWRI)) + panic("lfs_gather: bp not BO_DELWRI"); + if (!(bp->b_flags & B_LOCKED)) { + DLOG((DLOG_SEG, "lfs_gather: lbn %" PRId64 + " blk %" PRId64 " not B_LOCKED\n", + bp->b_lblkno, + dbtofsb(fs, bp->b_blkno))); + VOP_PRINT(bp->b_vp); + panic("lfs_gather: bp not B_LOCKED"); + } +#endif + if (lfs_gatherblock(sp, bp, &bufcache_lock)) { + goto loop; + } + count++; + } + mutex_exit(&bufcache_lock); + lfs_updatemeta(sp); + KASSERT(sp->vp == vp); + sp->vp = NULL; + return count; +} + +#if DEBUG +# define DEBUG_OOFF(n) do { \ + if (ooff == 0) { \ + DLOG((DLOG_SEG, "lfs_updatemeta[%d]: warning: writing " \ + "ino %d lbn %" PRId64 " at 0x%" PRIx32 \ + ", was 0x0 (or %" PRId64 ")\n", \ + (n), ip->i_number, lbn, ndaddr, daddr)); \ + } \ +} while (0) +#else +# define DEBUG_OOFF(n) +#endif + +/* + * Change the given block's address to ndaddr, finding its previous + * location using ufs_bmaparray(). + * + * Account for this change in the segment table. + * + * called with sp == NULL by roll-forwarding code. 
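+ * When sp is NULL, no partial segment is under construction, so
+ * the duplicate-inode (ndupino) adjustment below is skipped.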
+ */ +void +lfs_update_single(struct lfs *fs, struct segment *sp, + struct vnode *vp, daddr_t lbn, int32_t ndaddr, int size) +{ + SEGUSE *sup; + struct buf *bp; + struct indir a[NIADDR + 2], *ap; + struct inode *ip; + daddr_t daddr, ooff; + int num, error; + int bb, osize, obb; + + ASSERT_SEGLOCK(fs); + KASSERT(sp == NULL || sp->vp == vp); + ip = VTOI(vp); + + error = ufs_bmaparray(vp, lbn, &daddr, a, &num, NULL, NULL); + if (error) + panic("lfs_updatemeta: ufs_bmaparray returned %d", error); + + daddr = (daddr_t)((int32_t)daddr); /* XXX ondisk32 */ + KASSERT(daddr <= LFS_MAX_DADDR); + if (daddr > 0) + daddr = dbtofsb(fs, daddr); + + bb = numfrags(fs, size); + switch (num) { + case 0: + ooff = ip->i_ffs1_db[lbn]; + DEBUG_OOFF(0); + if (ooff == UNWRITTEN) + ip->i_ffs1_blocks += bb; + else { + /* possible fragment truncation or extension */ + obb = btofsb(fs, ip->i_lfs_fragsize[lbn]); + ip->i_ffs1_blocks += (bb - obb); + } + ip->i_ffs1_db[lbn] = ndaddr; + break; + case 1: + ooff = ip->i_ffs1_ib[a[0].in_off]; + DEBUG_OOFF(1); + if (ooff == UNWRITTEN) + ip->i_ffs1_blocks += bb; + ip->i_ffs1_ib[a[0].in_off] = ndaddr; + break; + default: + ap = &a[num - 1]; + if (bread(vp, ap->in_lbn, fs->lfs_bsize, NOCRED, + B_MODIFY, &bp)) + panic("lfs_updatemeta: bread bno %" PRId64, + ap->in_lbn); + + /* XXX ondisk32 */ + ooff = ((int32_t *)bp->b_data)[ap->in_off]; + DEBUG_OOFF(num); + if (ooff == UNWRITTEN) + ip->i_ffs1_blocks += bb; + /* XXX ondisk32 */ + ((int32_t *)bp->b_data)[ap->in_off] = ndaddr; + (void) VOP_BWRITE(bp->b_vp, bp); + } + + KASSERT(ooff == 0 || ooff == UNWRITTEN || ooff == daddr); + + /* Update hiblk when extending the file */ + if (lbn > ip->i_lfs_hiblk) + ip->i_lfs_hiblk = lbn; + + /* + * Though we'd rather it couldn't, this *can* happen right now + * if cleaning blocks and regular blocks coexist. + */ + /* KASSERT(daddr < fs->lfs_lastpseg || daddr > ndaddr); */ + + /* + * Update segment usage information, based on old size + * and location. + */ + if (daddr > 0) { + u_int32_t oldsn = dtosn(fs, daddr); +#ifdef DIAGNOSTIC + int ndupino; + + if (sp && sp->seg_number == oldsn) { + ndupino = sp->ndupino; + } else { + ndupino = 0; + } +#endif + KASSERT(oldsn < fs->lfs_nseg); + if (lbn >= 0 && lbn < NDADDR) + osize = ip->i_lfs_fragsize[lbn]; + else + osize = fs->lfs_bsize; + LFS_SEGENTRY(sup, fs, oldsn, bp); +#ifdef DIAGNOSTIC + if (sup->su_nbytes + sizeof (struct ufs1_dinode) * ndupino + < osize) { + printf("lfs_updatemeta: negative bytes " + "(segment %" PRIu32 " short by %" PRId64 + ")\n", dtosn(fs, daddr), + (int64_t)osize - + (sizeof (struct ufs1_dinode) * ndupino + + sup->su_nbytes)); + printf("lfs_updatemeta: ino %llu, lbn %" PRId64 + ", addr = 0x%" PRIx64 "\n", + (unsigned long long)ip->i_number, lbn, daddr); + printf("lfs_updatemeta: ndupino=%d\n", ndupino); + panic("lfs_updatemeta: negative bytes"); + sup->su_nbytes = osize - + sizeof (struct ufs1_dinode) * ndupino; + } +#endif + DLOG((DLOG_SU, "seg %" PRIu32 " -= %d for ino %d lbn %" PRId64 + " db 0x%" PRIx64 "\n", + dtosn(fs, daddr), osize, + ip->i_number, lbn, daddr)); + sup->su_nbytes -= osize; + if (!(bp->b_flags & B_GATHERED)) { + mutex_enter(&lfs_lock); + fs->lfs_flags |= LFS_IFDIRTY; + mutex_exit(&lfs_lock); + } + LFS_WRITESEGENTRY(sup, fs, oldsn, bp); + } + /* + * Now that this block has a new address, and its old + * segment no longer owns it, we can forget about its + * old size. 
+ */ + if (lbn >= 0 && lbn < NDADDR) + ip->i_lfs_fragsize[lbn] = size; +} + +/* + * Update the metadata that points to the blocks listed in the FINFO + * array. + */ +void +lfs_updatemeta(struct segment *sp) +{ + struct buf *sbp; + struct lfs *fs; + struct vnode *vp; + daddr_t lbn; + int i, nblocks, num; + int bb; + int bytesleft, size; + + ASSERT_SEGLOCK(sp->fs); + vp = sp->vp; + nblocks = &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp; + KASSERT(nblocks >= 0); + KASSERT(vp != NULL); + if (nblocks == 0) + return; + + /* + * This count may be high due to oversize blocks from lfs_gop_write. + * Correct for this. (XXX we should be able to keep track of these.) + */ + fs = sp->fs; + for (i = 0; i < nblocks; i++) { + if (sp->start_bpp[i] == NULL) { + DLOG((DLOG_SEG, "lfs_updatemeta: nblocks = %d, not %d\n", i, nblocks)); + nblocks = i; + break; + } + num = howmany(sp->start_bpp[i]->b_bcount, fs->lfs_bsize); + KASSERT(sp->start_bpp[i]->b_lblkno >= 0 || num == 1); + nblocks -= num - 1; + } + + KASSERT(vp->v_type == VREG || + nblocks == &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp); + KASSERT(nblocks == sp->cbpp - sp->start_bpp); + + /* + * Sort the blocks. + * + * We have to sort even if the blocks come from the + * cleaner, because there might be other pending blocks on the + * same inode...and if we don't sort, and there are fragments + * present, blocks may be written in the wrong place. + */ + lfs_shellsort(sp->start_bpp, sp->start_lbp, nblocks, fs->lfs_bsize); + + /* + * Record the length of the last block in case it's a fragment. + * If there are indirect blocks present, they sort last. An + * indirect block will be lfs_bsize and its presence indicates + * that you cannot have fragments. + * + * XXX This last is a lie. A cleaned fragment can coexist with + * XXX a later indirect block. This will continue to be + * XXX true until lfs_markv is fixed to do everything with + * XXX fake blocks (including fake inodes and fake indirect blocks). + */ + sp->fip->fi_lastlength = ((sp->start_bpp[nblocks - 1]->b_bcount - 1) & + fs->lfs_bmask) + 1; + + /* + * Assign disk addresses, and update references to the logical + * block and the segment usage information. + */ + for (i = nblocks; i--; ++sp->start_bpp) { + sbp = *sp->start_bpp; + lbn = *sp->start_lbp; + KASSERT(sbp->b_lblkno == lbn); + + sbp->b_blkno = fsbtodb(fs, fs->lfs_offset); + + /* + * If we write a frag in the wrong place, the cleaner won't + * be able to correctly identify its size later, and the + * segment will be uncleanable. (Even worse, it will assume + * that the indirect block that actually ends the list + * is of a smaller size!) + */ + if ((sbp->b_bcount & fs->lfs_bmask) && i != 0) + panic("lfs_updatemeta: fragment is not last block"); + + /* + * For each subblock in this possibly oversized block, + * update its address on disk. + */ + KASSERT(lbn >= 0 || sbp->b_bcount == fs->lfs_bsize); + KASSERT(vp == sbp->b_vp); + for (bytesleft = sbp->b_bcount; bytesleft > 0; + bytesleft -= fs->lfs_bsize) { + size = MIN(bytesleft, fs->lfs_bsize); + bb = numfrags(fs, size); + lbn = *sp->start_lbp++; + lfs_update_single(fs, sp, sp->vp, lbn, fs->lfs_offset, + size); + fs->lfs_offset += bb; + } + + } + + /* This inode has been modified */ + LFS_SET_UINO(VTOI(vp), IN_MODIFIED); +} + +/* + * Move lfs_offset to a segment earlier than sn. 
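+ * Returns 0 on success; ENOENT if no clean segment earlier than
+ * newsn could be found.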
+ */ +int +lfs_rewind(struct lfs *fs, int newsn) +{ + int sn, osn, isdirty; + struct buf *bp; + SEGUSE *sup; + + ASSERT_SEGLOCK(fs); + + osn = dtosn(fs, fs->lfs_offset); + if (osn < newsn) + return 0; + + /* lfs_avail eats the remaining space in this segment */ + fs->lfs_avail -= fs->lfs_fsbpseg - (fs->lfs_offset - fs->lfs_curseg); + + /* Find a low-numbered segment */ + for (sn = 0; sn < fs->lfs_nseg; ++sn) { + LFS_SEGENTRY(sup, fs, sn, bp); + isdirty = sup->su_flags & SEGUSE_DIRTY; + brelse(bp, 0); + + if (!isdirty) + break; + } + if (sn == fs->lfs_nseg) + panic("lfs_rewind: no clean segments"); + if (newsn >= 0 && sn >= newsn) + return ENOENT; + fs->lfs_nextseg = sn; + lfs_newseg(fs); + fs->lfs_offset = fs->lfs_curseg; + + return 0; +} + +/* + * Start a new partial segment. + * + * Return 1 when we entered to a new segment. + * Otherwise, return 0. + */ +int +lfs_initseg(struct lfs *fs) +{ + struct segment *sp = fs->lfs_sp; + SEGSUM *ssp; + struct buf *sbp; /* buffer for SEGSUM */ + int repeat = 0; /* return value */ + + ASSERT_SEGLOCK(fs); + /* Advance to the next segment. */ + if (!LFS_PARTIAL_FITS(fs)) { + SEGUSE *sup; + struct buf *bp; + + /* lfs_avail eats the remaining space */ + fs->lfs_avail -= fs->lfs_fsbpseg - (fs->lfs_offset - + fs->lfs_curseg); + /* Wake up any cleaning procs waiting on this file system. */ + lfs_wakeup_cleaner(fs); + lfs_newseg(fs); + repeat = 1; + fs->lfs_offset = fs->lfs_curseg; + + sp->seg_number = dtosn(fs, fs->lfs_curseg); + sp->seg_bytes_left = fsbtob(fs, fs->lfs_fsbpseg); + + /* + * If the segment contains a superblock, update the offset + * and summary address to skip over it. + */ + LFS_SEGENTRY(sup, fs, sp->seg_number, bp); + if (sup->su_flags & SEGUSE_SUPERBLOCK) { + fs->lfs_offset += btofsb(fs, LFS_SBPAD); + sp->seg_bytes_left -= LFS_SBPAD; + } + brelse(bp, 0); + /* Segment zero could also contain the labelpad */ + if (fs->lfs_version > 1 && sp->seg_number == 0 && + fs->lfs_start < btofsb(fs, LFS_LABELPAD)) { + fs->lfs_offset += + btofsb(fs, LFS_LABELPAD) - fs->lfs_start; + sp->seg_bytes_left -= + LFS_LABELPAD - fsbtob(fs, fs->lfs_start); + } + } else { + sp->seg_number = dtosn(fs, fs->lfs_curseg); + sp->seg_bytes_left = fsbtob(fs, fs->lfs_fsbpseg - + (fs->lfs_offset - fs->lfs_curseg)); + } + fs->lfs_lastpseg = fs->lfs_offset; + + /* Record first address of this partial segment */ + if (sp->seg_flags & SEGM_CLEAN) { + fs->lfs_cleanint[fs->lfs_cleanind] = fs->lfs_offset; + if (++fs->lfs_cleanind >= LFS_MAX_CLEANIND) { + /* "1" is the artificial inc in lfs_seglock */ + mutex_enter(&lfs_lock); + while (fs->lfs_iocount > 1) { + mtsleep(&fs->lfs_iocount, PRIBIO + 1, + "lfs_initseg", 0, &lfs_lock); + } + mutex_exit(&lfs_lock); + fs->lfs_cleanind = 0; + } + } + + sp->fs = fs; + sp->ibp = NULL; + sp->idp = NULL; + sp->ninodes = 0; + sp->ndupino = 0; + + sp->cbpp = sp->bpp; + + /* Get a new buffer for SEGSUM */ + sbp = lfs_newbuf(fs, VTOI(fs->lfs_ivnode)->i_devvp, + fsbtodb(fs, fs->lfs_offset), fs->lfs_sumsize, LFS_NB_SUMMARY); + + /* ... and enter it into the buffer list. */ + *sp->cbpp = sbp; + sp->cbpp++; + fs->lfs_offset += btofsb(fs, fs->lfs_sumsize); + + sp->start_bpp = sp->cbpp; + + /* Set point to SEGSUM, initialize it. */ + ssp = sp->segsum = sbp->b_data; + memset(ssp, 0, fs->lfs_sumsize); + ssp->ss_next = fs->lfs_nextseg; + ssp->ss_nfinfo = ssp->ss_ninos = 0; + ssp->ss_magic = SS_MAGIC; + + /* Set pointer to first FINFO, initialize it. 
*/ + sp->fip = (struct finfo *)((char *)sp->segsum + SEGSUM_SIZE(fs)); + sp->fip->fi_nblocks = 0; + sp->start_lbp = &sp->fip->fi_blocks[0]; + sp->fip->fi_lastlength = 0; + + sp->seg_bytes_left -= fs->lfs_sumsize; + sp->sum_bytes_left = fs->lfs_sumsize - SEGSUM_SIZE(fs); + + return (repeat); +} + +/* + * Remove SEGUSE_INVAL from all segments. + */ +void +lfs_unset_inval_all(struct lfs *fs) +{ + SEGUSE *sup; + struct buf *bp; + int i; + + for (i = 0; i < fs->lfs_nseg; i++) { + LFS_SEGENTRY(sup, fs, i, bp); + if (sup->su_flags & SEGUSE_INVAL) { + sup->su_flags &= ~SEGUSE_INVAL; + LFS_WRITESEGENTRY(sup, fs, i, bp); + } else + brelse(bp, 0); + } +} + +/* + * Return the next segment to write. + */ +void +lfs_newseg(struct lfs *fs) +{ + CLEANERINFO *cip; + SEGUSE *sup; + struct buf *bp; + int curseg, isdirty, sn, skip_inval; + + ASSERT_SEGLOCK(fs); + + /* Honor LFCNWRAPSTOP */ + mutex_enter(&lfs_lock); + while (fs->lfs_nextseg < fs->lfs_curseg && fs->lfs_nowrap) { + if (fs->lfs_wrappass) { + log(LOG_NOTICE, "%s: wrappass=%d\n", + fs->lfs_fsmnt, fs->lfs_wrappass); + fs->lfs_wrappass = 0; + break; + } + fs->lfs_wrapstatus = LFS_WRAP_WAITING; + wakeup(&fs->lfs_nowrap); + log(LOG_NOTICE, "%s: waiting at log wrap\n", fs->lfs_fsmnt); + mtsleep(&fs->lfs_wrappass, PVFS, "newseg", 10 * hz, + &lfs_lock); + } + fs->lfs_wrapstatus = LFS_WRAP_GOING; + mutex_exit(&lfs_lock); + + LFS_SEGENTRY(sup, fs, dtosn(fs, fs->lfs_nextseg), bp); + DLOG((DLOG_SU, "lfs_newseg: seg %d := 0 in newseg\n", + dtosn(fs, fs->lfs_nextseg))); + sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE; + sup->su_nbytes = 0; + sup->su_nsums = 0; + sup->su_ninos = 0; + LFS_WRITESEGENTRY(sup, fs, dtosn(fs, fs->lfs_nextseg), bp); + + LFS_CLEANERINFO(cip, fs, bp); + --cip->clean; + ++cip->dirty; + fs->lfs_nclean = cip->clean; + LFS_SYNC_CLEANERINFO(cip, fs, bp, 1); + + fs->lfs_lastseg = fs->lfs_curseg; + fs->lfs_curseg = fs->lfs_nextseg; + skip_inval = 1; + for (sn = curseg = dtosn(fs, fs->lfs_curseg) + fs->lfs_interleave;;) { + sn = (sn + 1) % fs->lfs_nseg; + + if (sn == curseg) { + if (skip_inval) + skip_inval = 0; + else + panic("lfs_nextseg: no clean segments"); + } + LFS_SEGENTRY(sup, fs, sn, bp); + isdirty = sup->su_flags & (SEGUSE_DIRTY | (skip_inval ? 
SEGUSE_INVAL : 0)); + /* Check SEGUSE_EMPTY as we go along */ + if (isdirty && sup->su_nbytes == 0 && + !(sup->su_flags & SEGUSE_EMPTY)) + LFS_WRITESEGENTRY(sup, fs, sn, bp); + else + brelse(bp, 0); + + if (!isdirty) + break; + } + if (skip_inval == 0) + lfs_unset_inval_all(fs); + + ++fs->lfs_nactive; + fs->lfs_nextseg = sntod(fs, sn); + if (lfs_dostats) { + ++lfs_stats.segsused; + } +} + +static struct buf * +lfs_newclusterbuf(struct lfs *fs, struct vnode *vp, daddr_t addr, + int n) +{ + struct lfs_cluster *cl; + struct buf **bpp, *bp; + + ASSERT_SEGLOCK(fs); + cl = (struct lfs_cluster *)pool_get(&fs->lfs_clpool, PR_WAITOK); + bpp = (struct buf **)pool_get(&fs->lfs_bpppool, PR_WAITOK); + memset(cl, 0, sizeof(*cl)); + cl->fs = fs; + cl->bpp = bpp; + cl->bufcount = 0; + cl->bufsize = 0; + + /* If this segment is being written synchronously, note that */ + if (fs->lfs_sp->seg_flags & SEGM_SYNC) { + cl->flags |= LFS_CL_SYNC; + cl->seg = fs->lfs_sp; + ++cl->seg->seg_iocount; + } + + /* Get an empty buffer header, or maybe one with something on it */ + bp = getiobuf(vp, true); + bp->b_dev = NODEV; + bp->b_blkno = bp->b_lblkno = addr; + bp->b_iodone = lfs_cluster_callback; + bp->b_private = cl; + + return bp; +} + +int +lfs_writeseg(struct lfs *fs, struct segment *sp) +{ + struct buf **bpp, *bp, *cbp, *newbp, *unbusybp; + SEGUSE *sup; + SEGSUM *ssp; + int i; + int do_again, nblocks, byteoffset; + size_t el_size; + struct lfs_cluster *cl; + u_short ninos; + struct vnode *devvp; + char *p = NULL; + struct vnode *vp; + int32_t *daddrp; /* XXX ondisk32 */ + int changed; + u_int32_t sum; +#ifdef DEBUG + FINFO *fip; + int findex; +#endif + + ASSERT_SEGLOCK(fs); + + ssp = (SEGSUM *)sp->segsum; + + /* + * If there are no buffers other than the segment summary to write, + * don't do anything. If we are the end of a dirop sequence, however, + * write the empty segment summary anyway, to help out the + * roll-forward agent. + */ + if ((nblocks = sp->cbpp - sp->bpp) == 1) { + if ((ssp->ss_flags & (SS_DIROP | SS_CONT)) != SS_DIROP) + return 0; + } + + /* Note if partial segment is being written by the cleaner */ + if (sp->seg_flags & SEGM_CLEAN) + ssp->ss_flags |= SS_CLEAN; + + devvp = VTOI(fs->lfs_ivnode)->i_devvp; + + /* Update the segment usage information. */ + LFS_SEGENTRY(sup, fs, sp->seg_number, bp); + + /* Loop through all blocks, except the segment summary. */ + for (bpp = sp->bpp; ++bpp < sp->cbpp; ) { + if ((*bpp)->b_vp != devvp) { + sup->su_nbytes += (*bpp)->b_bcount; + DLOG((DLOG_SU, "seg %" PRIu32 " += %ld for ino %d" + " lbn %" PRId64 " db 0x%" PRIx64 "\n", + sp->seg_number, (*bpp)->b_bcount, + VTOI((*bpp)->b_vp)->i_number, (*bpp)->b_lblkno, + (*bpp)->b_blkno)); + } + } + +#ifdef DEBUG + /* Check for zero-length and zero-version FINFO entries. 
*/ + fip = (struct finfo *)((char *)ssp + SEGSUM_SIZE(fs)); + for (findex = 0; findex < ssp->ss_nfinfo; findex++) { + KDASSERT(fip->fi_nblocks > 0); + KDASSERT(fip->fi_version > 0); + fip = (FINFO *)((char *)fip + FINFOSIZE + + sizeof(int32_t) * fip->fi_nblocks); + } +#endif /* DEBUG */ + + ninos = (ssp->ss_ninos + INOPB(fs) - 1) / INOPB(fs); + DLOG((DLOG_SU, "seg %d += %d for %d inodes\n", + sp->seg_number, ssp->ss_ninos * sizeof (struct ufs1_dinode), + ssp->ss_ninos)); + sup->su_nbytes += ssp->ss_ninos * sizeof (struct ufs1_dinode); + /* sup->su_nbytes += fs->lfs_sumsize; */ + if (fs->lfs_version == 1) + sup->su_olastmod = time_second; + else + sup->su_lastmod = time_second; + sup->su_ninos += ninos; + ++sup->su_nsums; + fs->lfs_avail -= btofsb(fs, fs->lfs_sumsize); + + do_again = !(bp->b_flags & B_GATHERED); + LFS_WRITESEGENTRY(sup, fs, sp->seg_number, bp); /* Ifile */ + + /* + * Mark blocks B_BUSY, to prevent then from being changed between + * the checksum computation and the actual write. + * + * If we are cleaning, check indirect blocks for UNWRITTEN, and if + * there are any, replace them with copies that have UNASSIGNED + * instead. + */ + mutex_enter(&bufcache_lock); + for (bpp = sp->bpp, i = nblocks - 1; i--;) { + ++bpp; + bp = *bpp; + if (bp->b_iodone != NULL) { /* UBC or malloced buffer */ + bp->b_cflags |= BC_BUSY; + continue; + } + + while (bp->b_cflags & BC_BUSY) { + DLOG((DLOG_SEG, "lfs_writeseg: avoiding potential" + " data summary corruption for ino %d, lbn %" + PRId64 "\n", + VTOI(bp->b_vp)->i_number, bp->b_lblkno)); + bp->b_cflags |= BC_WANTED; + cv_wait(&bp->b_busy, &bufcache_lock); + } + bp->b_cflags |= BC_BUSY; + mutex_exit(&bufcache_lock); + unbusybp = NULL; + + /* + * Check and replace indirect block UNWRITTEN bogosity. + * XXX See comment in lfs_writefile. + */ + if (bp->b_lblkno < 0 && bp->b_vp != devvp && bp->b_vp && + VTOI(bp->b_vp)->i_ffs1_blocks != + VTOI(bp->b_vp)->i_lfs_effnblks) { + DLOG((DLOG_VNODE, "lfs_writeseg: cleansing ino %d (%d != %d)\n", + VTOI(bp->b_vp)->i_number, + VTOI(bp->b_vp)->i_lfs_effnblks, + VTOI(bp->b_vp)->i_ffs1_blocks)); + /* Make a copy we'll make changes to */ + newbp = lfs_newbuf(fs, bp->b_vp, bp->b_lblkno, + bp->b_bcount, LFS_NB_IBLOCK); + newbp->b_blkno = bp->b_blkno; + memcpy(newbp->b_data, bp->b_data, + newbp->b_bcount); + + changed = 0; + /* XXX ondisk32 */ + for (daddrp = (int32_t *)(newbp->b_data); + daddrp < (int32_t *)((char *)newbp->b_data + + newbp->b_bcount); daddrp++) { + if (*daddrp == UNWRITTEN) { + ++changed; + *daddrp = 0; + } + } + /* + * Get rid of the old buffer. Don't mark it clean, + * though, if it still has dirty data on it. + */ + if (changed) { + DLOG((DLOG_SEG, "lfs_writeseg: replacing UNWRITTEN(%d):" + " bp = %p newbp = %p\n", changed, bp, + newbp)); + *bpp = newbp; + bp->b_flags &= ~B_GATHERED; + bp->b_error = 0; + if (bp->b_iodone != NULL) { + DLOG((DLOG_SEG, "lfs_writeseg: " + "indir bp should not be B_CALL\n")); + biodone(bp); + bp = NULL; + } else { + /* Still on free list, leave it there */ + unbusybp = bp; + /* + * We have to re-decrement lfs_avail + * since this block is going to come + * back around to us in the next + * segment. 
+ */ + fs->lfs_avail -= + btofsb(fs, bp->b_bcount); + } + } else { + lfs_freebuf(fs, newbp); + } + } + mutex_enter(&bufcache_lock); + if (unbusybp != NULL) { + unbusybp->b_cflags &= ~BC_BUSY; + if (unbusybp->b_cflags & BC_WANTED) + cv_broadcast(&bp->b_busy); + } + } + mutex_exit(&bufcache_lock); + + /* + * Compute checksum across data and then across summary; the first + * block (the summary block) is skipped. Set the create time here + * so that it's guaranteed to be later than the inode mod times. + */ + sum = 0; + if (fs->lfs_version == 1) + el_size = sizeof(u_long); + else + el_size = sizeof(u_int32_t); + for (bpp = sp->bpp, i = nblocks - 1; i--; ) { + ++bpp; + /* Loop through gop_write cluster blocks */ + for (byteoffset = 0; byteoffset < (*bpp)->b_bcount; + byteoffset += fs->lfs_bsize) { +#ifdef LFS_USE_B_INVAL + if (((*bpp)->b_cflags & BC_INVAL) != 0 && + (*bpp)->b_iodone != NULL) { + if (copyin((void *)(*bpp)->b_saveaddr + + byteoffset, dp, el_size)) { + panic("lfs_writeseg: copyin failed [1]:" + " ino %d blk %" PRId64, + VTOI((*bpp)->b_vp)->i_number, + (*bpp)->b_lblkno); + } + } else +#endif /* LFS_USE_B_INVAL */ + { + sum = lfs_cksum_part((char *) + (*bpp)->b_data + byteoffset, el_size, sum); + } + } + } + if (fs->lfs_version == 1) + ssp->ss_ocreate = time_second; + else { + ssp->ss_create = time_second; + ssp->ss_serial = ++fs->lfs_serial; + ssp->ss_ident = fs->lfs_ident; + } + ssp->ss_datasum = lfs_cksum_fold(sum); + ssp->ss_sumsum = cksum(&ssp->ss_datasum, + fs->lfs_sumsize - sizeof(ssp->ss_sumsum)); + + mutex_enter(&lfs_lock); + fs->lfs_bfree -= (btofsb(fs, ninos * fs->lfs_ibsize) + + btofsb(fs, fs->lfs_sumsize)); + fs->lfs_dmeta += (btofsb(fs, ninos * fs->lfs_ibsize) + + btofsb(fs, fs->lfs_sumsize)); + mutex_exit(&lfs_lock); + + /* + * When we simply write the blocks we lose a rotation for every block + * written. To avoid this problem, we cluster the buffers into a + * chunk and write the chunk. MAXPHYS is the largest size I/O + * devices can handle, use that for the size of the chunks. + * + * Blocks that are already clusters (from GOP_WRITE), however, we + * don't bother to copy into other clusters. + */ + +#define CHUNKSIZE MAXPHYS + + if (devvp == NULL) + panic("devvp is NULL"); + for (bpp = sp->bpp, i = nblocks; i;) { + cbp = lfs_newclusterbuf(fs, devvp, (*bpp)->b_blkno, i); + cl = cbp->b_private; + + cbp->b_flags |= B_ASYNC; + cbp->b_cflags |= BC_BUSY; + cbp->b_bcount = 0; + +#if defined(DEBUG) && defined(DIAGNOSTIC) + if (bpp - sp->bpp > (fs->lfs_sumsize - SEGSUM_SIZE(fs)) + / sizeof(int32_t)) { + panic("lfs_writeseg: real bpp overwrite"); + } + if (bpp - sp->bpp > segsize(fs) / fs->lfs_fsize) { + panic("lfs_writeseg: theoretical bpp overwrite"); + } +#endif + + /* + * Construct the cluster. 
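+ * Blocks are copied into a malloc'ed CHUNKSIZE buffer; oversized
+ * clusters from GOP_WRITE are referenced directly and get a
+ * cluster buffer of their own instead of being copied.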
+ */ + mutex_enter(&lfs_lock); + ++fs->lfs_iocount; + mutex_exit(&lfs_lock); + while (i && cbp->b_bcount < CHUNKSIZE) { + bp = *bpp; + + if (bp->b_bcount > (CHUNKSIZE - cbp->b_bcount)) + break; + if (cbp->b_bcount > 0 && !(cl->flags & LFS_CL_MALLOC)) + break; + + /* Clusters from GOP_WRITE are expedited */ + if (bp->b_bcount > fs->lfs_bsize) { + if (cbp->b_bcount > 0) + /* Put in its own buffer */ + break; + else { + cbp->b_data = bp->b_data; + } + } else if (cbp->b_bcount == 0) { + p = cbp->b_data = lfs_malloc(fs, CHUNKSIZE, + LFS_NB_CLUSTER); + cl->flags |= LFS_CL_MALLOC; + } +#ifdef DIAGNOSTIC + if (dtosn(fs, dbtofsb(fs, bp->b_blkno + + btodb(bp->b_bcount - 1))) != + sp->seg_number) { + printf("blk size %d daddr %" PRIx64 + " not in seg %d\n", + bp->b_bcount, bp->b_blkno, + sp->seg_number); + panic("segment overwrite"); + } +#endif + +#ifdef LFS_USE_B_INVAL + /* + * Fake buffers from the cleaner are marked as B_INVAL. + * We need to copy the data from user space rather than + * from the buffer indicated. + * XXX == what do I do on an error? + */ + if ((bp->b_cflags & BC_INVAL) != 0 && + bp->b_iodone != NULL) { + if (copyin(bp->b_saveaddr, p, bp->b_bcount)) + panic("lfs_writeseg: " + "copyin failed [2]"); + } else +#endif /* LFS_USE_B_INVAL */ + if (cl->flags & LFS_CL_MALLOC) { + /* copy data into our cluster. */ + memcpy(p, bp->b_data, bp->b_bcount); + p += bp->b_bcount; + } + + cbp->b_bcount += bp->b_bcount; + cl->bufsize += bp->b_bcount; + + bp->b_flags &= ~B_READ; + bp->b_error = 0; + cl->bpp[cl->bufcount++] = bp; + + vp = bp->b_vp; + mutex_enter(&bufcache_lock); + mutex_enter(vp->v_interlock); + bp->b_oflags &= ~(BO_DELWRI | BO_DONE); + reassignbuf(bp, vp); + vp->v_numoutput++; + mutex_exit(vp->v_interlock); + mutex_exit(&bufcache_lock); + + bpp++; + i--; + } + if (fs->lfs_sp->seg_flags & SEGM_SYNC) + BIO_SETPRIO(cbp, BPRIO_TIMECRITICAL); + else + BIO_SETPRIO(cbp, BPRIO_TIMELIMITED); + mutex_enter(devvp->v_interlock); + devvp->v_numoutput++; + mutex_exit(devvp->v_interlock); + VOP_STRATEGY(devvp, cbp); + curlwp->l_ru.ru_oublock++; + } + + if (lfs_dostats) { + ++lfs_stats.psegwrites; + lfs_stats.blocktot += nblocks - 1; + if (fs->lfs_sp->seg_flags & SEGM_SYNC) + ++lfs_stats.psyncwrites; + if (fs->lfs_sp->seg_flags & SEGM_CLEAN) { + ++lfs_stats.pcleanwrites; + lfs_stats.cleanblocks += nblocks - 1; + } + } + + return (lfs_initseg(fs) || do_again); +} + +void +lfs_writesuper(struct lfs *fs, daddr_t daddr) +{ + struct buf *bp; + struct vnode *devvp = VTOI(fs->lfs_ivnode)->i_devvp; + int s; + + ASSERT_MAYBE_SEGLOCK(fs); +#ifdef DIAGNOSTIC + KASSERT(fs->lfs_magic == LFS_MAGIC); +#endif + /* + * If we can write one superblock while another is in + * progress, we risk not having a complete checkpoint if we crash. + * So, block here if a superblock write is in progress. + */ + mutex_enter(&lfs_lock); + s = splbio(); + while (fs->lfs_sbactive) { + mtsleep(&fs->lfs_sbactive, PRIBIO+1, "lfs sb", 0, + &lfs_lock); + } + fs->lfs_sbactive = daddr; + splx(s); + mutex_exit(&lfs_lock); + + /* Set timestamp of this version of the superblock */ + if (fs->lfs_version == 1) + fs->lfs_otstamp = time_second; + fs->lfs_tstamp = time_second; + + /* Checksum the superblock and copy it into a buffer. 
*/ + fs->lfs_cksum = lfs_sb_cksum(&(fs->lfs_dlfs)); + bp = lfs_newbuf(fs, devvp, + fsbtodb(fs, daddr), LFS_SBPAD, LFS_NB_SBLOCK); + memset((char *)bp->b_data + sizeof(struct dlfs), 0, + LFS_SBPAD - sizeof(struct dlfs)); + *(struct dlfs *)bp->b_data = fs->lfs_dlfs; + + bp->b_cflags |= BC_BUSY; + bp->b_flags = (bp->b_flags & ~B_READ) | B_ASYNC; + bp->b_oflags &= ~(BO_DONE | BO_DELWRI); + bp->b_error = 0; + bp->b_iodone = lfs_supercallback; + + if (fs->lfs_sp != NULL && fs->lfs_sp->seg_flags & SEGM_SYNC) + BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); + else + BIO_SETPRIO(bp, BPRIO_TIMELIMITED); + curlwp->l_ru.ru_oublock++; + + mutex_enter(devvp->v_interlock); + devvp->v_numoutput++; + mutex_exit(devvp->v_interlock); + + mutex_enter(&lfs_lock); + ++fs->lfs_iocount; + mutex_exit(&lfs_lock); + VOP_STRATEGY(devvp, bp); +} + +/* + * Logical block number match routines used when traversing the dirty block + * chain. + */ +int +lfs_match_fake(struct lfs *fs, struct buf *bp) +{ + + ASSERT_SEGLOCK(fs); + return LFS_IS_MALLOC_BUF(bp); +} + +#if 0 +int +lfs_match_real(struct lfs *fs, struct buf *bp) +{ + + ASSERT_SEGLOCK(fs); + return (lfs_match_data(fs, bp) && !lfs_match_fake(fs, bp)); +} +#endif + +int +lfs_match_data(struct lfs *fs, struct buf *bp) +{ + + ASSERT_SEGLOCK(fs); + return (bp->b_lblkno >= 0); +} + +int +lfs_match_indir(struct lfs *fs, struct buf *bp) +{ + daddr_t lbn; + + ASSERT_SEGLOCK(fs); + lbn = bp->b_lblkno; + return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 0); +} + +int +lfs_match_dindir(struct lfs *fs, struct buf *bp) +{ + daddr_t lbn; + + ASSERT_SEGLOCK(fs); + lbn = bp->b_lblkno; + return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 1); +} + +int +lfs_match_tindir(struct lfs *fs, struct buf *bp) +{ + daddr_t lbn; + + ASSERT_SEGLOCK(fs); + lbn = bp->b_lblkno; + return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 2); +} + +static void +lfs_free_aiodone(struct buf *bp) +{ + struct lfs *fs; + + KERNEL_LOCK(1, curlwp); + fs = bp->b_private; + ASSERT_NO_SEGLOCK(fs); + lfs_freebuf(fs, bp); + KERNEL_UNLOCK_LAST(curlwp); +} + +static void +lfs_super_aiodone(struct buf *bp) +{ + struct lfs *fs; + + KERNEL_LOCK(1, curlwp); + fs = bp->b_private; + ASSERT_NO_SEGLOCK(fs); + mutex_enter(&lfs_lock); + fs->lfs_sbactive = 0; + if (--fs->lfs_iocount <= 1) + wakeup(&fs->lfs_iocount); + wakeup(&fs->lfs_sbactive); + mutex_exit(&lfs_lock); + lfs_freebuf(fs, bp); + KERNEL_UNLOCK_LAST(curlwp); +} + +static void +lfs_cluster_aiodone(struct buf *bp) +{ + struct lfs_cluster *cl; + struct lfs *fs; + struct buf *tbp, *fbp; + struct vnode *vp, *devvp, *ovp; + struct inode *ip; + int error; + + KERNEL_LOCK(1, curlwp); + + error = bp->b_error; + cl = bp->b_private; + fs = cl->fs; + devvp = VTOI(fs->lfs_ivnode)->i_devvp; + ASSERT_NO_SEGLOCK(fs); + + /* Put the pages back, and release the buffer */ + while (cl->bufcount--) { + tbp = cl->bpp[cl->bufcount]; + KASSERT(tbp->b_cflags & BC_BUSY); + if (error) { + tbp->b_error = error; + } + + /* + * We're done with tbp. If it has not been re-dirtied since + * the cluster was written, free it. Otherwise, keep it on + * the locked list to be written again. 
+ */ + vp = tbp->b_vp; + + tbp->b_flags &= ~B_GATHERED; + + LFS_BCLEAN_LOG(fs, tbp); + + mutex_enter(&bufcache_lock); + if (tbp->b_iodone == NULL) { + KASSERT(tbp->b_flags & B_LOCKED); + bremfree(tbp); + if (vp) { + mutex_enter(vp->v_interlock); + reassignbuf(tbp, vp); + mutex_exit(vp->v_interlock); + } + tbp->b_flags |= B_ASYNC; /* for biodone */ + } + + if (((tbp->b_flags | tbp->b_oflags) & + (B_LOCKED | BO_DELWRI)) == B_LOCKED) + LFS_UNLOCK_BUF(tbp); + + if (tbp->b_oflags & BO_DONE) { + DLOG((DLOG_SEG, "blk %d biodone already (flags %lx)\n", + cl->bufcount, (long)tbp->b_flags)); + } + + if (tbp->b_iodone != NULL && !LFS_IS_MALLOC_BUF(tbp)) { + /* + * A buffer from the page daemon. + * We use the same iodone as it does, + * so we must manually disassociate its + * buffers from the vp. + */ + if ((ovp = tbp->b_vp) != NULL) { + /* This is just silly */ + mutex_enter(ovp->v_interlock); + brelvp(tbp); + mutex_exit(ovp->v_interlock); + tbp->b_vp = vp; + tbp->b_objlock = vp->v_interlock; + } + /* Put it back the way it was */ + tbp->b_flags |= B_ASYNC; + /* Master buffers have BC_AGE */ + if (tbp->b_private == tbp) + tbp->b_cflags |= BC_AGE; + } + mutex_exit(&bufcache_lock); + + biodone(tbp); + + /* + * If this is the last block for this vnode, but + * there are other blocks on its dirty list, + * set IN_MODIFIED/IN_CLEANING depending on what + * sort of block. Only do this for our mount point, + * not for, e.g., inode blocks that are attached to + * the devvp. + * XXX KS - Shouldn't we set *both* if both types + * of blocks are present (traverse the dirty list?) + */ + mutex_enter(&lfs_lock); + mutex_enter(vp->v_interlock); + if (vp != devvp && vp->v_numoutput == 0 && + (fbp = LIST_FIRST(&vp->v_dirtyblkhd)) != NULL) { + ip = VTOI(vp); + DLOG((DLOG_SEG, "lfs_cluster_aiodone: mark ino %d\n", + ip->i_number)); + if (LFS_IS_MALLOC_BUF(fbp)) + LFS_SET_UINO(ip, IN_CLEANING); + else + LFS_SET_UINO(ip, IN_MODIFIED); + } + cv_broadcast(&vp->v_cv); + mutex_exit(vp->v_interlock); + mutex_exit(&lfs_lock); + } + + /* Fix up the cluster buffer, and release it */ + if (cl->flags & LFS_CL_MALLOC) + lfs_free(fs, bp->b_data, LFS_NB_CLUSTER); + putiobuf(bp); + + /* Note i/o done */ + if (cl->flags & LFS_CL_SYNC) { + if (--cl->seg->seg_iocount == 0) + wakeup(&cl->seg->seg_iocount); + } + mutex_enter(&lfs_lock); +#ifdef DIAGNOSTIC + if (fs->lfs_iocount == 0) + panic("lfs_cluster_aiodone: zero iocount"); +#endif + if (--fs->lfs_iocount <= 1) + wakeup(&fs->lfs_iocount); + mutex_exit(&lfs_lock); + + KERNEL_UNLOCK_LAST(curlwp); + + pool_put(&fs->lfs_bpppool, cl->bpp); + cl->bpp = NULL; + pool_put(&fs->lfs_clpool, cl); +} + +static void +lfs_generic_callback(struct buf *bp, void (*aiodone)(struct buf *)) +{ + /* reset b_iodone for when this is a single-buf i/o. */ + bp->b_iodone = aiodone; + + workqueue_enqueue(uvm.aiodone_queue, &bp->b_work, NULL); +} + +static void +lfs_cluster_callback(struct buf *bp) +{ + + lfs_generic_callback(bp, lfs_cluster_aiodone); +} + +void +lfs_supercallback(struct buf *bp) +{ + + lfs_generic_callback(bp, lfs_super_aiodone); +} + +/* + * The only buffers that are going to hit these functions are the + * segment write blocks, or the segment summaries, or the superblocks. + * + * All of the above are created by lfs_newbuf, and so do not need to be + * released via brelse. 
+ */ +void +lfs_callback(struct buf *bp) +{ + + lfs_generic_callback(bp, lfs_free_aiodone); +} + +/* + * Shellsort (diminishing increment sort) from Data Structures and + * Algorithms, Aho, Hopcroft and Ullman, 1983 Edition, page 290; + * see also Knuth Vol. 3, page 84. The increments are selected from + * formula (8), page 95. Roughly O(N^3/2). + */ +/* + * This is our own private copy of shellsort because we want to sort + * two parallel arrays (the array of buffer pointers and the array of + * logical block numbers) simultaneously. Note that we cast the array + * of logical block numbers to unsigned in this routine so that the + * negative block numbers (meta data blocks) sort AFTER the data blocks. + */ + +void +lfs_shellsort(struct buf **bp_array, int32_t *lb_array, int nmemb, int size) +{ + static int __rsshell_increments[] = { 4, 1, 0 }; + int incr, *incrp, t1, t2; + struct buf *bp_temp; + +#ifdef DEBUG + incr = 0; + for (t1 = 0; t1 < nmemb; t1++) { + for (t2 = 0; t2 * size < bp_array[t1]->b_bcount; t2++) { + if (lb_array[incr++] != bp_array[t1]->b_lblkno + t2) { + /* dump before panic */ + printf("lfs_shellsort: nmemb=%d, size=%d\n", + nmemb, size); + incr = 0; + for (t1 = 0; t1 < nmemb; t1++) { + const struct buf *bp = bp_array[t1]; + + printf("bp[%d]: lbn=%" PRIu64 ", size=%" + PRIu64 "\n", t1, + (uint64_t)bp->b_lblkno, + (uint64_t)bp->b_bcount); + printf("lbns:"); + for (t2 = 0; t2 * size < bp->b_bcount; + t2++) { + printf(" %" PRId32, + lb_array[incr++]); + } + printf("\n"); + } + panic("lfs_shellsort: inconsistent input"); + } + } + } +#endif + + for (incrp = __rsshell_increments; (incr = *incrp++) != 0;) + for (t1 = incr; t1 < nmemb; ++t1) + for (t2 = t1 - incr; t2 >= 0;) + if ((u_int32_t)bp_array[t2]->b_lblkno > + (u_int32_t)bp_array[t2 + incr]->b_lblkno) { + bp_temp = bp_array[t2]; + bp_array[t2] = bp_array[t2 + incr]; + bp_array[t2 + incr] = bp_temp; + t2 -= incr; + } else + break; + + /* Reform the list of logical blocks */ + incr = 0; + for (t1 = 0; t1 < nmemb; t1++) { + for (t2 = 0; t2 * size < bp_array[t1]->b_bcount; t2++) { + lb_array[incr++] = bp_array[t1]->b_lblkno + t2; + } + } +} + +/* + * Call vget with LK_NOWAIT. If we are the one who holds VI_XLOCK, + * however, we must press on. Just fake success in that case. + */ +int +lfs_vref(struct vnode *vp) +{ + int error; + struct lfs *fs; + + KASSERT(mutex_owned(vp->v_interlock)); + + fs = VTOI(vp)->i_lfs; + + ASSERT_MAYBE_SEGLOCK(fs); + + /* + * If we return 1 here during a flush, we risk vinvalbuf() not + * being able to flush all of the pages from this vnode, which + * will cause it to panic. So, return 0 if a flush is in progress. + */ + error = vget(vp, LK_NOWAIT); + if (error == EBUSY && IS_FLUSHING(VTOI(vp)->i_lfs, vp)) { + ++fs->lfs_flushvp_fakevref; + return 0; + } + return error; +} + +/* + * This is vrele except that we do not want to VOP_INACTIVE this vnode. We + * inline vrele here to avoid the vn_lock and VOP_INACTIVE call at the end. + */ +void +lfs_vunref(struct vnode *vp) +{ + struct lfs *fs; + + fs = VTOI(vp)->i_lfs; + ASSERT_MAYBE_SEGLOCK(fs); + + /* + * Analogous to lfs_vref, if the node is flushing, fake it. + */ + if (IS_FLUSHING(fs, vp) && fs->lfs_flushvp_fakevref) { + --fs->lfs_flushvp_fakevref; + return; + } + + /* does not call inactive */ + mutex_enter(vp->v_interlock); + vrelel(vp, 0); +} + +/* + * We use this when we have vnodes that were loaded in solely for cleaning.
+ * There is no reason to believe that these vnodes will be referenced again + * soon, since the cleaning process is unrelated to normal filesystem + * activity. Putting cleaned vnodes at the tail of the list has the effect + * of flushing the vnode LRU. So, put vnodes that were loaded only for + * cleaning at the head of the list, instead. + */ +void +lfs_vunref_head(struct vnode *vp) +{ + + ASSERT_SEGLOCK(VTOI(vp)->i_lfs); + + /* does not call inactive, inserts non-held vnode at head of freelist */ + mutex_enter(vp->v_interlock); + vrelel(vp, 0); +} + + +/* + * Set up an FINFO entry for a new file. The fip pointer is assumed to + * point at uninitialized space. + */ +void +lfs_acquire_finfo(struct lfs *fs, ino_t ino, int vers) +{ + struct segment *sp = fs->lfs_sp; + + KASSERT(vers > 0); + + if (sp->seg_bytes_left < fs->lfs_bsize || + sp->sum_bytes_left < sizeof(struct finfo)) + (void) lfs_writeseg(fs, fs->lfs_sp); + + sp->sum_bytes_left -= FINFOSIZE; + ++((SEGSUM *)(sp->segsum))->ss_nfinfo; + sp->fip->fi_nblocks = 0; + sp->fip->fi_ino = ino; + sp->fip->fi_version = vers; +} + +/* + * Release the FINFO entry, either clearing out an unused entry or + * advancing us to the next available entry. + */ +void +lfs_release_finfo(struct lfs *fs) +{ + struct segment *sp = fs->lfs_sp; + + if (sp->fip->fi_nblocks != 0) { + sp->fip = (FINFO*)((char *)sp->fip + FINFOSIZE + + sizeof(int32_t) * sp->fip->fi_nblocks); + sp->start_lbp = &sp->fip->fi_blocks[0]; + } else { + sp->sum_bytes_left += FINFOSIZE; + --((SEGSUM *)(sp->segsum))->ss_nfinfo; + } +} diff --git a/sys/ufs/lfs/lfs_subr.c b/sys/ufs/lfs/lfs_subr.c new file mode 100644 index 000000000..4da38aae3 --- /dev/null +++ b/sys/ufs/lfs/lfs_subr.c @@ -0,0 +1,661 @@ +/* $NetBSD: lfs_subr.c,v 1.76 2010/06/25 10:03:52 hannken Exp $ */ + +/*- + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Konrad E. Schroder . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_subr.c 8.4 (Berkeley) 5/8/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: lfs_subr.c,v 1.76 2010/06/25 10:03:52 hannken Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#ifdef DEBUG +const char *lfs_res_names[LFS_NB_COUNT] = { + "summary", + "superblock", + "file block", + "cluster", + "clean", + "blkiov", +}; +#endif + +int lfs_res_qty[LFS_NB_COUNT] = { + LFS_N_SUMMARIES, + LFS_N_SBLOCKS, + LFS_N_IBLOCKS, + LFS_N_CLUSTERS, + LFS_N_CLEAN, + LFS_N_BLKIOV, +}; + +void +lfs_setup_resblks(struct lfs *fs) +{ + int i, j; + int maxbpp; + + ASSERT_NO_SEGLOCK(fs); + fs->lfs_resblk = (res_t *)malloc(LFS_N_TOTAL * sizeof(res_t), M_SEGMENT, + M_WAITOK); + for (i = 0; i < LFS_N_TOTAL; i++) { + fs->lfs_resblk[i].inuse = 0; + fs->lfs_resblk[i].p = NULL; + } + for (i = 0; i < LFS_RESHASH_WIDTH; i++) + LIST_INIT(fs->lfs_reshash + i); + + /* + * These types of allocations can be larger than a page, + * so we can't use the pool subsystem for them. + */ + for (i = 0, j = 0; j < LFS_N_SUMMARIES; j++, i++) + fs->lfs_resblk[i].size = fs->lfs_sumsize; + for (j = 0; j < LFS_N_SBLOCKS; j++, i++) + fs->lfs_resblk[i].size = LFS_SBPAD; + for (j = 0; j < LFS_N_IBLOCKS; j++, i++) + fs->lfs_resblk[i].size = fs->lfs_bsize; + for (j = 0; j < LFS_N_CLUSTERS; j++, i++) + fs->lfs_resblk[i].size = MAXPHYS; + for (j = 0; j < LFS_N_CLEAN; j++, i++) + fs->lfs_resblk[i].size = MAXPHYS; + for (j = 0; j < LFS_N_BLKIOV; j++, i++) + fs->lfs_resblk[i].size = LFS_MARKV_MAXBLKCNT * sizeof(BLOCK_INFO); + + for (i = 0; i < LFS_N_TOTAL; i++) { + fs->lfs_resblk[i].p = malloc(fs->lfs_resblk[i].size, + M_SEGMENT, M_WAITOK); + } + + /* + * Initialize pools for small types (XXX is BPP small?) 
+ */ + pool_init(&fs->lfs_clpool, sizeof(struct lfs_cluster), 0, 0, 0, + "lfsclpl", &pool_allocator_nointr, IPL_NONE); + pool_init(&fs->lfs_segpool, sizeof(struct segment), 0, 0, 0, + "lfssegpool", &pool_allocator_nointr, IPL_NONE); + maxbpp = ((fs->lfs_sumsize - SEGSUM_SIZE(fs)) / sizeof(int32_t) + 2); + maxbpp = MIN(maxbpp, segsize(fs) / fs->lfs_fsize + 2); + pool_init(&fs->lfs_bpppool, maxbpp * sizeof(struct buf *), 0, 0, 0, + "lfsbpppl", &pool_allocator_nointr, IPL_NONE); +} + +void +lfs_free_resblks(struct lfs *fs) +{ + int i; + + pool_destroy(&fs->lfs_bpppool); + pool_destroy(&fs->lfs_segpool); + pool_destroy(&fs->lfs_clpool); + + mutex_enter(&lfs_lock); + for (i = 0; i < LFS_N_TOTAL; i++) { + while (fs->lfs_resblk[i].inuse) + mtsleep(&fs->lfs_resblk, PRIBIO + 1, "lfs_free", 0, + &lfs_lock); + if (fs->lfs_resblk[i].p != NULL) + free(fs->lfs_resblk[i].p, M_SEGMENT); + } + free(fs->lfs_resblk, M_SEGMENT); + mutex_exit(&lfs_lock); +} + +static unsigned int +lfs_mhash(void *vp) +{ + return (unsigned int)(((unsigned long)vp) >> 2) % LFS_RESHASH_WIDTH; +} + +/* + * Return memory of the given size for the given purpose, or use one of a + * number of spare last-resort buffers, if malloc returns NULL. + */ +void * +lfs_malloc(struct lfs *fs, size_t size, int type) +{ + struct lfs_res_blk *re; + void *r; + int i, s, start; + unsigned int h; + + ASSERT_MAYBE_SEGLOCK(fs); + r = NULL; + + /* If no mem allocated for this type, it just waits */ + if (lfs_res_qty[type] == 0) { + r = malloc(size, M_SEGMENT, M_WAITOK); + return r; + } + + /* Otherwise try a quick malloc, and if it works, great */ + if ((r = malloc(size, M_SEGMENT, M_NOWAIT)) != NULL) { + return r; + } + + /* + * If malloc returned NULL, we are forced to use one of our + * reserve blocks. We have on hand at least one summary block, + * at least one cluster block, at least one superblock, + * and several indirect blocks. + */ + + mutex_enter(&lfs_lock); + /* skip over blocks of other types */ + for (i = 0, start = 0; i < type; i++) + start += lfs_res_qty[i]; + while (r == NULL) { + for (i = 0; i < lfs_res_qty[type]; i++) { + if (fs->lfs_resblk[start + i].inuse == 0) { + re = fs->lfs_resblk + start + i; + re->inuse = 1; + r = re->p; + KASSERT(re->size >= size); + h = lfs_mhash(r); + s = splbio(); + LIST_INSERT_HEAD(&fs->lfs_reshash[h], re, res); + splx(s); + mutex_exit(&lfs_lock); + return r; + } + } + DLOG((DLOG_MALLOC, "sleeping on %s (%d)\n", + lfs_res_names[type], lfs_res_qty[type])); + mtsleep(&fs->lfs_resblk, PVM, "lfs_malloc", 0, + &lfs_lock); + DLOG((DLOG_MALLOC, "done sleeping on %s\n", + lfs_res_names[type])); + } + /* NOTREACHED */ + mutex_exit(&lfs_lock); + return r; +} + +void +lfs_free(struct lfs *fs, void *p, int type) +{ + int s; + unsigned int h; + res_t *re; +#ifdef DEBUG + int i; +#endif + + ASSERT_MAYBE_SEGLOCK(fs); + h = lfs_mhash(p); + mutex_enter(&lfs_lock); + s = splbio(); + LIST_FOREACH(re, &fs->lfs_reshash[h], res) { + if (re->p == p) { + KASSERT(re->inuse == 1); + LIST_REMOVE(re, res); + re->inuse = 0; + wakeup(&fs->lfs_resblk); + splx(s); + mutex_exit(&lfs_lock); + return; + } + } +#ifdef DEBUG + for (i = 0; i < LFS_N_TOTAL; i++) { + if (fs->lfs_resblk[i].p == p) + panic("lfs_free: inconsistent reserved block"); + } +#endif + splx(s); + mutex_exit(&lfs_lock); + + /* + * If we didn't find it, free it. + */ + free(p, M_SEGMENT); +} + +/* + * lfs_seglock -- + * Single thread the segment writer. 
+ */ +int +lfs_seglock(struct lfs *fs, unsigned long flags) +{ + struct segment *sp; + + mutex_enter(&lfs_lock); + if (fs->lfs_seglock) { + if (fs->lfs_lockpid == curproc->p_pid && + fs->lfs_locklwp == curlwp->l_lid) { + ++fs->lfs_seglock; + fs->lfs_sp->seg_flags |= flags; + mutex_exit(&lfs_lock); + return 0; + } else if (flags & SEGM_PAGEDAEMON) { + mutex_exit(&lfs_lock); + return EWOULDBLOCK; + } else { + while (fs->lfs_seglock) { + (void)mtsleep(&fs->lfs_seglock, PRIBIO + 1, + "lfs_seglock", 0, &lfs_lock); + } + } + } + + fs->lfs_seglock = 1; + fs->lfs_lockpid = curproc->p_pid; + fs->lfs_locklwp = curlwp->l_lid; + mutex_exit(&lfs_lock); + fs->lfs_cleanind = 0; + +#ifdef DEBUG + LFS_ENTER_LOG("seglock", __FILE__, __LINE__, 0, flags, curproc->p_pid); +#endif + /* Drain fragment size changes out */ + rw_enter(&fs->lfs_fraglock, RW_WRITER); + + sp = fs->lfs_sp = pool_get(&fs->lfs_segpool, PR_WAITOK); + sp->bpp = pool_get(&fs->lfs_bpppool, PR_WAITOK); + sp->seg_flags = flags; + sp->vp = NULL; + sp->seg_iocount = 0; + (void) lfs_initseg(fs); + + /* + * Keep a cumulative count of the outstanding I/O operations. If the + * disk drive catches up with us it could go to zero before we finish, + * so we artificially increment it by one until we've scheduled all of + * the writes we intend to do. + */ + mutex_enter(&lfs_lock); + ++fs->lfs_iocount; + mutex_exit(&lfs_lock); + return 0; +} + +static void lfs_unmark_dirop(struct lfs *); + +static void +lfs_unmark_dirop(struct lfs *fs) +{ + struct inode *ip, *nip; + struct vnode *vp; + int doit; + + ASSERT_NO_SEGLOCK(fs); + mutex_enter(&lfs_lock); + doit = !(fs->lfs_flags & LFS_UNDIROP); + if (doit) + fs->lfs_flags |= LFS_UNDIROP; + if (!doit) { + mutex_exit(&lfs_lock); + return; + } + + for (ip = TAILQ_FIRST(&fs->lfs_dchainhd); ip != NULL; ip = nip) { + nip = TAILQ_NEXT(ip, i_lfs_dchain); + vp = ITOV(ip); + if ((VTOI(vp)->i_flag & (IN_ADIROP | IN_ALLMOD)) == 0) { + --lfs_dirvcount; + --fs->lfs_dirvcount; + vp->v_uflag &= ~VU_DIROP; + TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain); + wakeup(&lfs_dirvcount); + fs->lfs_unlockvp = vp; + mutex_exit(&lfs_lock); + vrele(vp); + mutex_enter(&lfs_lock); + fs->lfs_unlockvp = NULL; + } + } + + fs->lfs_flags &= ~LFS_UNDIROP; + wakeup(&fs->lfs_flags); + mutex_exit(&lfs_lock); +} + +static void +lfs_auto_segclean(struct lfs *fs) +{ + int i, error, s, waited; + + ASSERT_SEGLOCK(fs); + /* + * Now that we've swapped lfs_activesb, but while we still + * hold the segment lock, run through the segment list marking + * the empty ones clean. + * XXX - do we really need to do them all at once? + */ + waited = 0; + for (i = 0; i < fs->lfs_nseg; i++) { + if ((fs->lfs_suflags[0][i] & + (SEGUSE_ACTIVE | SEGUSE_DIRTY | SEGUSE_EMPTY)) == + (SEGUSE_DIRTY | SEGUSE_EMPTY) && + (fs->lfs_suflags[1][i] & + (SEGUSE_ACTIVE | SEGUSE_DIRTY | SEGUSE_EMPTY)) == + (SEGUSE_DIRTY | SEGUSE_EMPTY)) { + + /* Make sure the sb is written before we clean */ + mutex_enter(&lfs_lock); + s = splbio(); + while (waited == 0 && fs->lfs_sbactive) + mtsleep(&fs->lfs_sbactive, PRIBIO+1, "lfs asb", + 0, &lfs_lock); + splx(s); + mutex_exit(&lfs_lock); + waited = 1; + + if ((error = lfs_do_segclean(fs, i)) != 0) { + DLOG((DLOG_CLEAN, "lfs_auto_segclean: lfs_do_segclean returned %d for seg %d\n", error, i)); + } + } + fs->lfs_suflags[1 - fs->lfs_activesb][i] = + fs->lfs_suflags[fs->lfs_activesb][i]; + } +} + +/* + * lfs_segunlock -- + * Single thread the segment writer. 
+ */ +void +lfs_segunlock(struct lfs *fs) +{ + struct segment *sp; + unsigned long sync, ckp; + struct buf *bp; + int do_unmark_dirop = 0; + + sp = fs->lfs_sp; + + mutex_enter(&lfs_lock); + KASSERT(LFS_SEGLOCK_HELD(fs)); + if (fs->lfs_seglock == 1) { + if ((sp->seg_flags & (SEGM_PROT | SEGM_CLEAN)) == 0 && + LFS_STARVED_FOR_SEGS(fs) == 0) + do_unmark_dirop = 1; + mutex_exit(&lfs_lock); + sync = sp->seg_flags & SEGM_SYNC; + ckp = sp->seg_flags & SEGM_CKP; + + /* We should have a segment summary, and nothing else */ + KASSERT(sp->cbpp == sp->bpp + 1); + + /* Free allocated segment summary */ + fs->lfs_offset -= btofsb(fs, fs->lfs_sumsize); + bp = *sp->bpp; + lfs_freebuf(fs, bp); + + pool_put(&fs->lfs_bpppool, sp->bpp); + sp->bpp = NULL; + + /* + * If we're not sync, we're done with sp, get rid of it. + * Otherwise, we keep a local copy around but free + * fs->lfs_sp so another process can use it (we have to + * wait but they don't have to wait for us). + */ + if (!sync) + pool_put(&fs->lfs_segpool, sp); + fs->lfs_sp = NULL; + + /* + * If the I/O count is non-zero, sleep until it reaches zero. + * At the moment, the user's process hangs around so we can + * sleep. + */ + mutex_enter(&lfs_lock); + if (--fs->lfs_iocount == 0) { + LFS_DEBUG_COUNTLOCKED("lfs_segunlock"); + } + if (fs->lfs_iocount <= 1) + wakeup(&fs->lfs_iocount); + mutex_exit(&lfs_lock); + /* + * If we're not checkpointing, we don't have to block + * other processes to wait for a synchronous write + * to complete. + */ + if (!ckp) { +#ifdef DEBUG + LFS_ENTER_LOG("segunlock_std", __FILE__, __LINE__, 0, 0, curproc->p_pid); +#endif + mutex_enter(&lfs_lock); + --fs->lfs_seglock; + fs->lfs_lockpid = 0; + fs->lfs_locklwp = 0; + mutex_exit(&lfs_lock); + wakeup(&fs->lfs_seglock); + } + /* + * We let checkpoints happen asynchronously. That means + * that during recovery, we have to roll forward between + * the two segments described by the first and second + * superblocks to make sure that the checkpoint described + * by a superblock completed. 
+ */ + mutex_enter(&lfs_lock); + while (ckp && sync && fs->lfs_iocount) { + (void)mtsleep(&fs->lfs_iocount, PRIBIO + 1, + "lfs_iocount", 0, &lfs_lock); + DLOG((DLOG_SEG, "sleeping on iocount %x == %d\n", fs, fs->lfs_iocount)); + } + while (sync && sp->seg_iocount) { + (void)mtsleep(&sp->seg_iocount, PRIBIO + 1, + "seg_iocount", 0, &lfs_lock); + DLOG((DLOG_SEG, "sleeping on iocount %x == %d\n", sp, sp->seg_iocount)); + } + mutex_exit(&lfs_lock); + if (sync) + pool_put(&fs->lfs_segpool, sp); + + if (ckp) { + fs->lfs_nactive = 0; + /* If we *know* everything's on disk, write both sbs */ + /* XXX should wait for this one */ + if (sync) + lfs_writesuper(fs, fs->lfs_sboffs[fs->lfs_activesb]); + lfs_writesuper(fs, fs->lfs_sboffs[1 - fs->lfs_activesb]); + if (!(fs->lfs_ivnode->v_mount->mnt_iflag & IMNT_UNMOUNT)) { + lfs_auto_segclean(fs); + /* If sync, we can clean the remainder too */ + if (sync) + lfs_auto_segclean(fs); + } + fs->lfs_activesb = 1 - fs->lfs_activesb; +#ifdef DEBUG + LFS_ENTER_LOG("segunlock_ckp", __FILE__, __LINE__, 0, 0, curproc->p_pid); +#endif + mutex_enter(&lfs_lock); + --fs->lfs_seglock; + fs->lfs_lockpid = 0; + fs->lfs_locklwp = 0; + mutex_exit(&lfs_lock); + wakeup(&fs->lfs_seglock); + } + /* Reenable fragment size changes */ + rw_exit(&fs->lfs_fraglock); + if (do_unmark_dirop) + lfs_unmark_dirop(fs); + } else if (fs->lfs_seglock == 0) { + mutex_exit(&lfs_lock); + panic ("Seglock not held"); + } else { + --fs->lfs_seglock; + mutex_exit(&lfs_lock); + } +} + +/* + * Drain dirops and start writer. + * + * No simple_locks are held when we enter and none are held when we return. + */ +int +lfs_writer_enter(struct lfs *fs, const char *wmesg) +{ + int error = 0; + + ASSERT_MAYBE_SEGLOCK(fs); + mutex_enter(&lfs_lock); + + /* disallow dirops during flush */ + fs->lfs_writer++; + + while (fs->lfs_dirops > 0) { + ++fs->lfs_diropwait; + error = mtsleep(&fs->lfs_writer, PRIBIO+1, wmesg, 0, + &lfs_lock); + --fs->lfs_diropwait; + } + + if (error) + fs->lfs_writer--; + + mutex_exit(&lfs_lock); + + return error; +} + +void +lfs_writer_leave(struct lfs *fs) +{ + bool dowakeup; + + ASSERT_MAYBE_SEGLOCK(fs); + mutex_enter(&lfs_lock); + dowakeup = !(--fs->lfs_writer); + mutex_exit(&lfs_lock); + if (dowakeup) + wakeup(&fs->lfs_dirops); +} + +/* + * Unlock, wait for the cleaner, then relock to where we were before. + * To be used only at a fairly high level, to address a paucity of free + * segments propagated back from lfs_gop_write(). + */ +void +lfs_segunlock_relock(struct lfs *fs) +{ + int n = fs->lfs_seglock; + u_int16_t seg_flags; + CLEANERINFO *cip; + struct buf *bp; + + if (n == 0) + return; + + /* Write anything we've already gathered to disk */ + lfs_writeseg(fs, fs->lfs_sp); + + /* Tell cleaner */ + LFS_CLEANERINFO(cip, fs, bp); + cip->flags |= LFS_CLEANER_MUST_CLEAN; + LFS_SYNC_CLEANERINFO(cip, fs, bp, 1); + + /* Save segment flags for later */ + seg_flags = fs->lfs_sp->seg_flags; + + fs->lfs_sp->seg_flags |= SEGM_PROT; /* Don't unmark dirop nodes */ + while(fs->lfs_seglock) + lfs_segunlock(fs); + + /* Wait for the cleaner */ + lfs_wakeup_cleaner(fs); + mutex_enter(&lfs_lock); + while (LFS_STARVED_FOR_SEGS(fs)) + mtsleep(&fs->lfs_avail, PRIBIO, "relock", 0, + &lfs_lock); + mutex_exit(&lfs_lock); + + /* Put the segment lock back the way it was. 
*/ + while(n--) + lfs_seglock(fs, seg_flags); + + /* Cleaner can relax now */ + LFS_CLEANERINFO(cip, fs, bp); + cip->flags &= ~LFS_CLEANER_MUST_CLEAN; + LFS_SYNC_CLEANERINFO(cip, fs, bp, 1); + + return; +} + +/* + * Wake up the cleaner, provided that nowrap is not set. + */ +void +lfs_wakeup_cleaner(struct lfs *fs) +{ + if (fs->lfs_nowrap > 0) + return; + + wakeup(&fs->lfs_nextseg); + wakeup(&lfs_allclean_wakeup); +} diff --git a/sys/ufs/lfs/lfs_syscalls.c b/sys/ufs/lfs/lfs_syscalls.c new file mode 100644 index 000000000..442b81d46 --- /dev/null +++ b/sys/ufs/lfs/lfs_syscalls.c @@ -0,0 +1,1224 @@ +/* $NetBSD: lfs_syscalls.c,v 1.139 2011/06/12 03:36:01 rmind Exp $ */ + +/*- + * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2007, 2008 + * The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Konrad E. Schroder . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/*- + * Copyright (c) 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_syscalls.c 8.10 (Berkeley) 5/14/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.139 2011/06/12 03:36:01 rmind Exp $"); + +#ifndef LFS +# define LFS /* for prototypes in syscallargs.h */ +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +struct buf *lfs_fakebuf(struct lfs *, struct vnode *, int, size_t, void *); +int lfs_fasthashget(dev_t, ino_t, struct vnode **); + +pid_t lfs_cleaner_pid = 0; + +/* + * sys_lfs_markv: + * + * This will mark inodes and blocks dirty, so they are written into the log. + * It will block until all the blocks have been written. The segment create + * time passed in the block_info and inode_info structures is used to decide + * if the data is valid for each block (in case some process dirtied a block + * or inode that is being cleaned between the determination that a block is + * live and the lfs_markv call). + * + * 0 on success + * -1/errno is return on error. + */ +#ifdef USE_64BIT_SYSCALLS +int +sys_lfs_markv(struct lwp *l, const struct sys_lfs_markv_args *uap, register_t *retval) +{ + /* { + syscallarg(fsid_t *) fsidp; + syscallarg(struct block_info *) blkiov; + syscallarg(int) blkcnt; + } */ + BLOCK_INFO *blkiov; + int blkcnt, error; + fsid_t fsid; + struct lfs *fs; + struct mount *mntp; + + if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, + NULL)) != 0) + return (error); + + if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) + return (error); + + if ((mntp = vfs_getvfs(fsidp)) == NULL) + return (ENOENT); + fs = VFSTOUFS(mntp)->um_lfs; + + blkcnt = SCARG(uap, blkcnt); + if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT) + return (EINVAL); + + KERNEL_LOCK(1, NULL); + blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV); + if ((error = copyin(SCARG(uap, blkiov), blkiov, + blkcnt * sizeof(BLOCK_INFO))) != 0) + goto out; + + if ((error = lfs_markv(p, &fsid, blkiov, blkcnt)) == 0) + copyout(blkiov, SCARG(uap, blkiov), + blkcnt * sizeof(BLOCK_INFO)); + out: + lfs_free(fs, blkiov, LFS_NB_BLKIOV); + KERNEL_UNLOCK_ONE(NULL); + return error; +} +#else +int +sys_lfs_markv(struct lwp *l, const struct sys_lfs_markv_args *uap, register_t *retval) +{ + /* { + syscallarg(fsid_t *) fsidp; + syscallarg(struct block_info *) blkiov; + syscallarg(int) blkcnt; + } */ + BLOCK_INFO *blkiov; + BLOCK_INFO_15 *blkiov15; + int i, blkcnt, error; + fsid_t fsid; + struct lfs *fs; + struct mount *mntp; + + if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, + NULL)) != 0) + return (error); + + if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) + return (error); + + if ((mntp = vfs_getvfs(&fsid)) == NULL) + return (ENOENT); + fs = VFSTOUFS(mntp)->um_lfs; + + blkcnt = SCARG(uap, blkcnt); + if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT) + return (EINVAL); + + KERNEL_LOCK(1, NULL); + blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV); + 
blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV); + if ((error = copyin(SCARG(uap, blkiov), blkiov15, + blkcnt * sizeof(BLOCK_INFO_15))) != 0) + goto out; + + for (i = 0; i < blkcnt; i++) { + blkiov[i].bi_inode = blkiov15[i].bi_inode; + blkiov[i].bi_lbn = blkiov15[i].bi_lbn; + blkiov[i].bi_daddr = blkiov15[i].bi_daddr; + blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate; + blkiov[i].bi_version = blkiov15[i].bi_version; + blkiov[i].bi_bp = blkiov15[i].bi_bp; + blkiov[i].bi_size = blkiov15[i].bi_size; + } + + if ((error = lfs_markv(l->l_proc, &fsid, blkiov, blkcnt)) == 0) { + for (i = 0; i < blkcnt; i++) { + blkiov15[i].bi_inode = blkiov[i].bi_inode; + blkiov15[i].bi_lbn = blkiov[i].bi_lbn; + blkiov15[i].bi_daddr = blkiov[i].bi_daddr; + blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate; + blkiov15[i].bi_version = blkiov[i].bi_version; + blkiov15[i].bi_bp = blkiov[i].bi_bp; + blkiov15[i].bi_size = blkiov[i].bi_size; + } + copyout(blkiov15, SCARG(uap, blkiov), + blkcnt * sizeof(BLOCK_INFO_15)); + } + out: + lfs_free(fs, blkiov, LFS_NB_BLKIOV); + lfs_free(fs, blkiov15, LFS_NB_BLKIOV); + KERNEL_UNLOCK_ONE(NULL); + return error; +} +#endif + +#define LFS_MARKV_MAX_BLOCKS (LFS_MAX_BUFS) + +int +lfs_markv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, + int blkcnt) +{ + BLOCK_INFO *blkp; + IFILE *ifp; + struct buf *bp; + struct inode *ip = NULL; + struct lfs *fs; + struct mount *mntp; + struct vnode *vp = NULL; + ino_t lastino; + daddr_t b_daddr, v_daddr; + int cnt, error; + int do_again = 0; + int numrefed = 0; + ino_t maxino; + size_t obsize; + + /* number of blocks/inodes that we have already bwrite'ed */ + int nblkwritten, ninowritten; + + if ((mntp = vfs_getvfs(fsidp)) == NULL) + return (ENOENT); + + fs = VFSTOUFS(mntp)->um_lfs; + + if (fs->lfs_ronly) + return EROFS; + + maxino = (fragstoblks(fs, VTOI(fs->lfs_ivnode)->i_ffs1_blocks) - + fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb; + + cnt = blkcnt; + + if ((error = vfs_busy(mntp, NULL)) != 0) + return (error); + + /* + * This seglock is just to prevent the fact that we might have to sleep + * from allowing the possibility that our blocks might become + * invalid. + * + * It is also important to note here that unless we specify SEGM_CKP, + * any Ifile blocks that we might be asked to clean will never get + * to the disk. + */ + lfs_seglock(fs, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC); + + /* Mark blocks/inodes dirty. */ + error = 0; + + /* these were inside the initialization for the for loop */ + v_daddr = LFS_UNUSED_DADDR; + lastino = LFS_UNUSED_INUM; + nblkwritten = ninowritten = 0; + for (blkp = blkiov; cnt--; ++blkp) + { + /* Bounds-check incoming data, avoid panic for failed VGET */ + if (blkp->bi_inode <= 0 || blkp->bi_inode >= maxino) { + error = EINVAL; + goto err3; + } + /* + * Get the IFILE entry (only once) and see if the file still + * exists. + */ + if (lastino != blkp->bi_inode) { + /* + * Finish the old file, if there was one. The presence + * of a usable vnode in vp is signaled by a valid v_daddr. + */ + if (v_daddr != LFS_UNUSED_DADDR) { + lfs_vunref(vp); + numrefed--; + } + + /* + * Start a new file + */ + lastino = blkp->bi_inode; + if (blkp->bi_inode == LFS_IFILE_INUM) + v_daddr = fs->lfs_idaddr; + else { + LFS_IENTRY(ifp, fs, blkp->bi_inode, bp); + /* XXX fix for force write */ + v_daddr = ifp->if_daddr; + brelse(bp, 0); + } + if (v_daddr == LFS_UNUSED_DADDR) + continue; + + /* Get the vnode/inode. 
*/ + error = lfs_fastvget(mntp, blkp->bi_inode, v_daddr, + &vp, + (blkp->bi_lbn == LFS_UNUSED_LBN + ? blkp->bi_bp + : NULL)); + + if (!error) { + numrefed++; + } + if (error) { + DLOG((DLOG_CLEAN, "lfs_markv: lfs_fastvget" + " failed with %d (ino %d, segment %d)\n", + error, blkp->bi_inode, + dtosn(fs, blkp->bi_daddr))); + /* + * If we got EAGAIN, that means that the + * Inode was locked. This is + * recoverable: just clean the rest of + * this segment, and let the cleaner try + * again with another. (When the + * cleaner runs again, this segment will + * sort high on the list, since it is + * now almost entirely empty.) But, we + * still set v_daddr = LFS_UNUSED_ADDR + * so as not to test this over and over + * again. + */ + if (error == EAGAIN) { + error = 0; + do_again++; + } +#ifdef DIAGNOSTIC + else if (error != ENOENT) + panic("lfs_markv VFS_VGET FAILED"); +#endif + /* lastino = LFS_UNUSED_INUM; */ + v_daddr = LFS_UNUSED_DADDR; + vp = NULL; + ip = NULL; + continue; + } + ip = VTOI(vp); + ninowritten++; + } else if (v_daddr == LFS_UNUSED_DADDR) { + /* + * This can only happen if the vnode is dead (or + * in any case we can't get it...e.g., it is + * inlocked). Keep going. + */ + continue; + } + + /* Past this point we are guaranteed that vp, ip are valid. */ + + /* Can't clean VU_DIROP directories in case of truncation */ + /* XXX - maybe we should mark removed dirs specially? */ + if (vp->v_type == VDIR && (vp->v_uflag & VU_DIROP)) { + do_again++; + continue; + } + + /* If this BLOCK_INFO didn't contain a block, keep going. */ + if (blkp->bi_lbn == LFS_UNUSED_LBN) { + /* XXX need to make sure that the inode gets written in this case */ + /* XXX but only write the inode if it's the right one */ + if (blkp->bi_inode != LFS_IFILE_INUM) { + LFS_IENTRY(ifp, fs, blkp->bi_inode, bp); + if (ifp->if_daddr == blkp->bi_daddr) { + mutex_enter(&lfs_lock); + LFS_SET_UINO(ip, IN_CLEANING); + mutex_exit(&lfs_lock); + } + brelse(bp, 0); + } + continue; + } + + b_daddr = 0; + if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &b_daddr, NULL) || + dbtofsb(fs, b_daddr) != blkp->bi_daddr) + { + if (dtosn(fs, dbtofsb(fs, b_daddr)) == + dtosn(fs, blkp->bi_daddr)) + { + DLOG((DLOG_CLEAN, "lfs_markv: wrong da same seg: %llx vs %llx\n", + (long long)blkp->bi_daddr, (long long)dbtofsb(fs, b_daddr))); + } + do_again++; + continue; + } + + /* + * Check block sizes. The blocks being cleaned come from + * disk, so they should have the same size as their on-disk + * counterparts. + */ + if (blkp->bi_lbn >= 0) + obsize = blksize(fs, ip, blkp->bi_lbn); + else + obsize = fs->lfs_bsize; + /* Check for fragment size change */ + if (blkp->bi_lbn >= 0 && blkp->bi_lbn < NDADDR) { + obsize = ip->i_lfs_fragsize[blkp->bi_lbn]; + } + if (obsize != blkp->bi_size) { + DLOG((DLOG_CLEAN, "lfs_markv: ino %d lbn %lld wrong" + " size (%ld != %d), try again\n", + blkp->bi_inode, (long long)blkp->bi_lbn, + (long) obsize, blkp->bi_size)); + do_again++; + continue; + } + + /* + * If we get to here, then we are keeping the block. If + * it is an indirect block, we want to actually put it + * in the buffer cache so that it can be updated in the + * finish_meta section. If it's not, we need to + * allocate a fake buffer so that writeseg can perform + * the copyin and write the buffer. 
+ */ + if (ip->i_number != LFS_IFILE_INUM && blkp->bi_lbn >= 0) { + /* Data Block */ + bp = lfs_fakebuf(fs, vp, blkp->bi_lbn, + blkp->bi_size, blkp->bi_bp); + /* Pretend we used bread() to get it */ + bp->b_blkno = fsbtodb(fs, blkp->bi_daddr); + } else { + /* Indirect block or ifile */ + if (blkp->bi_size != fs->lfs_bsize && + ip->i_number != LFS_IFILE_INUM) + panic("lfs_markv: partial indirect block?" + " size=%d\n", blkp->bi_size); + bp = getblk(vp, blkp->bi_lbn, blkp->bi_size, 0, 0); + if (!(bp->b_oflags & (BO_DONE|BO_DELWRI))) { + /* + * The block in question was not found + * in the cache; i.e., the block that + * getblk() returned is empty. So, we + * can (and should) copy in the + * contents, because we've already + * determined that this was the right + * version of this block on disk. + * + * And, it can't have changed underneath + * us, because we have the segment lock. + */ + error = copyin(blkp->bi_bp, bp->b_data, blkp->bi_size); + if (error) + goto err2; + } + } + if ((error = lfs_bwrite_ext(bp, BW_CLEAN)) != 0) + goto err2; + + nblkwritten++; + /* + * XXX should account indirect blocks and ifile pages as well + */ + if (nblkwritten + lblkno(fs, ninowritten * sizeof (struct ufs1_dinode)) + > LFS_MARKV_MAX_BLOCKS) { + DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos\n", + nblkwritten, ninowritten)); + lfs_segwrite(mntp, SEGM_CLEAN); + nblkwritten = ninowritten = 0; + } + } + + /* + * Finish the old file, if there was one + */ + if (v_daddr != LFS_UNUSED_DADDR) { + lfs_vunref(vp); + numrefed--; + } + +#ifdef DIAGNOSTIC + if (numrefed != 0) + panic("lfs_markv: numrefed=%d", numrefed); +#endif + DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos (check point)\n", + nblkwritten, ninowritten)); + + /* + * The last write has to be SEGM_SYNC, because of calling semantics. + * It also has to be SEGM_CKP, because otherwise we could write + * over the newly cleaned data contained in a checkpoint, and then + * we'd be unhappy at recovery time. + */ + lfs_segwrite(mntp, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC); + + lfs_segunlock(fs); + + vfs_unbusy(mntp, false, NULL); + if (error) + return (error); + else if (do_again) + return EAGAIN; + + return 0; + +err2: + DLOG((DLOG_CLEAN, "lfs_markv err2\n")); + + /* + * XXX we're here because copyin() failed. + * XXX it means that we can't trust the cleanerd. too bad. + * XXX how can we recover from this? + */ + +err3: + KERNEL_UNLOCK_ONE(NULL); + /* + * XXX should do segwrite here anyway? + */ + + if (v_daddr != LFS_UNUSED_DADDR) { + lfs_vunref(vp); + --numrefed; + } + + lfs_segunlock(fs); + vfs_unbusy(mntp, false, NULL); +#ifdef DIAGNOSTIC + if (numrefed != 0) + panic("lfs_markv: numrefed=%d", numrefed); +#endif + + return (error); +} + +/* + * sys_lfs_bmapv: + * + * This will fill in the current disk address for arrays of blocks. + * + * 0 on success + * -1/errno is return on error. 
+ */ +#ifdef USE_64BIT_SYSCALLS +int +sys_lfs_bmapv(struct lwp *l, const struct sys_lfs_bmapv_args *uap, register_t *retval) +{ + /* { + syscallarg(fsid_t *) fsidp; + syscallarg(struct block_info *) blkiov; + syscallarg(int) blkcnt; + } */ + BLOCK_INFO *blkiov; + int blkcnt, error; + fsid_t fsid; + struct lfs *fs; + struct mount *mntp; + + if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, + NULL)) != 0) + return (error); + + if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) + return (error); + + if ((mntp = vfs_getvfs(&fsid)) == NULL) + return (ENOENT); + fs = VFSTOUFS(mntp)->um_lfs; + + blkcnt = SCARG(uap, blkcnt); + if ((u_int) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO)) + return (EINVAL); + KERNEL_LOCK(1, NULL); + blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV); + if ((error = copyin(SCARG(uap, blkiov), blkiov, + blkcnt * sizeof(BLOCK_INFO))) != 0) + goto out; + + if ((error = lfs_bmapv(p, &fsid, blkiov, blkcnt)) == 0) + copyout(blkiov, SCARG(uap, blkiov), + blkcnt * sizeof(BLOCK_INFO)); + out: + lfs_free(fs, blkiov, LFS_NB_BLKIOV); + KERNEL_UNLOCK_ONE(NULL); + return error; +} +#else +int +sys_lfs_bmapv(struct lwp *l, const struct sys_lfs_bmapv_args *uap, register_t *retval) +{ + /* { + syscallarg(fsid_t *) fsidp; + syscallarg(struct block_info *) blkiov; + syscallarg(int) blkcnt; + } */ + BLOCK_INFO *blkiov; + BLOCK_INFO_15 *blkiov15; + int i, blkcnt, error; + fsid_t fsid; + struct lfs *fs; + struct mount *mntp; + + if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, + NULL)) != 0) + return (error); + + if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) + return (error); + + if ((mntp = vfs_getvfs(&fsid)) == NULL) + return (ENOENT); + fs = VFSTOUFS(mntp)->um_lfs; + + blkcnt = SCARG(uap, blkcnt); + if ((size_t) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO)) + return (EINVAL); + KERNEL_LOCK(1, NULL); + blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV); + blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV); + if ((error = copyin(SCARG(uap, blkiov), blkiov15, + blkcnt * sizeof(BLOCK_INFO_15))) != 0) + goto out; + + for (i = 0; i < blkcnt; i++) { + blkiov[i].bi_inode = blkiov15[i].bi_inode; + blkiov[i].bi_lbn = blkiov15[i].bi_lbn; + blkiov[i].bi_daddr = blkiov15[i].bi_daddr; + blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate; + blkiov[i].bi_version = blkiov15[i].bi_version; + blkiov[i].bi_bp = blkiov15[i].bi_bp; + blkiov[i].bi_size = blkiov15[i].bi_size; + } + + if ((error = lfs_bmapv(l->l_proc, &fsid, blkiov, blkcnt)) == 0) { + for (i = 0; i < blkcnt; i++) { + blkiov15[i].bi_inode = blkiov[i].bi_inode; + blkiov15[i].bi_lbn = blkiov[i].bi_lbn; + blkiov15[i].bi_daddr = blkiov[i].bi_daddr; + blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate; + blkiov15[i].bi_version = blkiov[i].bi_version; + blkiov15[i].bi_bp = blkiov[i].bi_bp; + blkiov15[i].bi_size = blkiov[i].bi_size; + } + copyout(blkiov15, SCARG(uap, blkiov), + blkcnt * sizeof(BLOCK_INFO_15)); + } + out: + lfs_free(fs, blkiov, LFS_NB_BLKIOV); + lfs_free(fs, blkiov15, LFS_NB_BLKIOV); + KERNEL_UNLOCK_ONE(NULL); + return error; +} +#endif + +int +lfs_bmapv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt) +{ + BLOCK_INFO *blkp; + IFILE *ifp; + struct buf *bp; + struct inode *ip = NULL; + struct lfs *fs; + struct mount *mntp; + struct ufsmount *ump; + struct vnode *vp; + ino_t lastino; + daddr_t v_daddr; + int cnt, error; + int numrefed = 0; + + lfs_cleaner_pid = p->p_pid; + + if ((mntp = 
vfs_getvfs(fsidp)) == NULL) + return (ENOENT); + + ump = VFSTOUFS(mntp); + if ((error = vfs_busy(mntp, NULL)) != 0) + return (error); + + cnt = blkcnt; + + fs = VFSTOUFS(mntp)->um_lfs; + + error = 0; + + /* these were inside the initialization for the for loop */ + v_daddr = LFS_UNUSED_DADDR; + lastino = LFS_UNUSED_INUM; + for (blkp = blkiov; cnt--; ++blkp) + { + /* + * Get the IFILE entry (only once) and see if the file still + * exists. + */ + if (lastino != blkp->bi_inode) { + /* + * Finish the old file, if there was one. The presence + * of a usable vnode in vp is signaled by a valid + * v_daddr. + */ + if (v_daddr != LFS_UNUSED_DADDR) { + lfs_vunref(vp); + numrefed--; + } + + /* + * Start a new file + */ + lastino = blkp->bi_inode; + if (blkp->bi_inode == LFS_IFILE_INUM) + v_daddr = fs->lfs_idaddr; + else { + LFS_IENTRY(ifp, fs, blkp->bi_inode, bp); + v_daddr = ifp->if_daddr; + brelse(bp, 0); + } + if (v_daddr == LFS_UNUSED_DADDR) { + blkp->bi_daddr = LFS_UNUSED_DADDR; + continue; + } + /* + * A regular call to VFS_VGET could deadlock + * here. Instead, we try an unlocked access. + */ + mutex_enter(&ufs_ihash_lock); + vp = ufs_ihashlookup(ump->um_dev, blkp->bi_inode); + if (vp != NULL && !(vp->v_iflag & VI_XLOCK)) { + ip = VTOI(vp); + mutex_enter(vp->v_interlock); + mutex_exit(&ufs_ihash_lock); + if (lfs_vref(vp)) { + v_daddr = LFS_UNUSED_DADDR; + continue; + } + numrefed++; + } else { + mutex_exit(&ufs_ihash_lock); + /* + * Don't VFS_VGET if we're being unmounted, + * since we hold vfs_busy(). + */ + if (mntp->mnt_iflag & IMNT_UNMOUNT) { + v_daddr = LFS_UNUSED_DADDR; + continue; + } + error = VFS_VGET(mntp, blkp->bi_inode, &vp); + if (error) { + DLOG((DLOG_CLEAN, "lfs_bmapv: vget ino" + "%d failed with %d", + blkp->bi_inode,error)); + v_daddr = LFS_UNUSED_DADDR; + continue; + } else { + KASSERT(VOP_ISLOCKED(vp)); + VOP_UNLOCK(vp); + numrefed++; + } + } + ip = VTOI(vp); + } else if (v_daddr == LFS_UNUSED_DADDR) { + /* + * This can only happen if the vnode is dead. + * Keep going. Note that we DO NOT set the + * bi_addr to anything -- if we failed to get + * the vnode, for example, we want to assume + * conservatively that all of its blocks *are* + * located in the segment in question. + * lfs_markv will throw them out if we are + * wrong. + */ + /* blkp->bi_daddr = LFS_UNUSED_DADDR; */ + continue; + } + + /* Past this point we are guaranteed that vp, ip are valid. */ + + if (blkp->bi_lbn == LFS_UNUSED_LBN) { + /* + * We just want the inode address, which is + * conveniently in v_daddr. + */ + blkp->bi_daddr = v_daddr; + } else { + daddr_t bi_daddr; + + /* XXX ondisk32 */ + error = VOP_BMAP(vp, blkp->bi_lbn, NULL, + &bi_daddr, NULL); + if (error) + { + blkp->bi_daddr = LFS_UNUSED_DADDR; + continue; + } + blkp->bi_daddr = dbtofsb(fs, bi_daddr); + /* Fill in the block size, too */ + if (blkp->bi_lbn >= 0) + blkp->bi_size = blksize(fs, ip, blkp->bi_lbn); + else + blkp->bi_size = fs->lfs_bsize; + } + } + + /* + * Finish the old file, if there was one. The presence + * of a usable vnode in vp is signaled by a valid v_daddr. + */ + if (v_daddr != LFS_UNUSED_DADDR) { + lfs_vunref(vp); + numrefed--; + } + +#ifdef DIAGNOSTIC + if (numrefed != 0) + panic("lfs_bmapv: numrefed=%d", numrefed); +#endif + + vfs_unbusy(mntp, false, NULL); + + return 0; +} + +/* + * sys_lfs_segclean: + * + * Mark the segment clean. + * + * 0 on success + * -1/errno is return on error. 
+ */ +int +sys_lfs_segclean(struct lwp *l, const struct sys_lfs_segclean_args *uap, register_t *retval) +{ + /* { + syscallarg(fsid_t *) fsidp; + syscallarg(u_long) segment; + } */ + struct lfs *fs; + struct mount *mntp; + fsid_t fsid; + int error; + unsigned long segnum; + + if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, + NULL)) != 0) + return (error); + + if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) + return (error); + if ((mntp = vfs_getvfs(&fsid)) == NULL) + return (ENOENT); + + fs = VFSTOUFS(mntp)->um_lfs; + segnum = SCARG(uap, segment); + + if ((error = vfs_busy(mntp, NULL)) != 0) + return (error); + + KERNEL_LOCK(1, NULL); + lfs_seglock(fs, SEGM_PROT); + error = lfs_do_segclean(fs, segnum); + lfs_segunlock(fs); + KERNEL_UNLOCK_ONE(NULL); + vfs_unbusy(mntp, false, NULL); + return error; +} + +/* + * Actually mark the segment clean. + * Must be called with the segment lock held. + */ +int +lfs_do_segclean(struct lfs *fs, unsigned long segnum) +{ + extern int lfs_dostats; + struct buf *bp; + CLEANERINFO *cip; + SEGUSE *sup; + + if (dtosn(fs, fs->lfs_curseg) == segnum) { + return (EBUSY); + } + + LFS_SEGENTRY(sup, fs, segnum, bp); + if (sup->su_nbytes) { + DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:" + " %d live bytes\n", segnum, sup->su_nbytes)); + brelse(bp, 0); + return (EBUSY); + } + if (sup->su_flags & SEGUSE_ACTIVE) { + DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:" + " segment is active\n", segnum)); + brelse(bp, 0); + return (EBUSY); + } + if (!(sup->su_flags & SEGUSE_DIRTY)) { + DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:" + " segment is already clean\n", segnum)); + brelse(bp, 0); + return (EALREADY); + } + + fs->lfs_avail += segtod(fs, 1); + if (sup->su_flags & SEGUSE_SUPERBLOCK) + fs->lfs_avail -= btofsb(fs, LFS_SBPAD); + if (fs->lfs_version > 1 && segnum == 0 && + fs->lfs_start < btofsb(fs, LFS_LABELPAD)) + fs->lfs_avail -= btofsb(fs, LFS_LABELPAD) - fs->lfs_start; + mutex_enter(&lfs_lock); + fs->lfs_bfree += sup->su_nsums * btofsb(fs, fs->lfs_sumsize) + + btofsb(fs, sup->su_ninos * fs->lfs_ibsize); + fs->lfs_dmeta -= sup->su_nsums * btofsb(fs, fs->lfs_sumsize) + + btofsb(fs, sup->su_ninos * fs->lfs_ibsize); + if (fs->lfs_dmeta < 0) + fs->lfs_dmeta = 0; + mutex_exit(&lfs_lock); + sup->su_flags &= ~SEGUSE_DIRTY; + LFS_WRITESEGENTRY(sup, fs, segnum, bp); + + LFS_CLEANERINFO(cip, fs, bp); + ++cip->clean; + --cip->dirty; + fs->lfs_nclean = cip->clean; + cip->bfree = fs->lfs_bfree; + mutex_enter(&lfs_lock); + cip->avail = fs->lfs_avail - fs->lfs_ravail - fs->lfs_favail; + wakeup(&fs->lfs_avail); + mutex_exit(&lfs_lock); + (void) LFS_BWRITE_LOG(bp); + + if (lfs_dostats) + ++lfs_stats.segs_reclaimed; + + return (0); +} + +/* + * This will block until a segment in file system fsid is written. A timeout + * in milliseconds may be specified which will awake the cleaner automatically. + * An fsid of -1 means any file system, and a timeout of 0 means forever. + */ +int +lfs_segwait(fsid_t *fsidp, struct timeval *tv) +{ + struct mount *mntp; + void *addr; + u_long timeout; + int error; + + KERNEL_LOCK(1, NULL); + if (fsidp == NULL || (mntp = vfs_getvfs(fsidp)) == NULL) + addr = &lfs_allclean_wakeup; + else + addr = &VFSTOUFS(mntp)->um_lfs->lfs_nextseg; + /* + * XXX THIS COULD SLEEP FOREVER IF TIMEOUT IS {0,0}! + * XXX IS THAT WHAT IS INTENDED? + */ + timeout = tvtohz(tv); + error = tsleep(addr, PCATCH | PVFS, "segment", timeout); + KERNEL_UNLOCK_ONE(NULL); + return (error == ERESTART ? 
EINTR : 0); +} + +/* + * sys_lfs_segwait: + * + * System call wrapper around lfs_segwait(). + * + * 0 on success + * 1 on timeout + * -1/errno is return on error. + */ +int +sys___lfs_segwait50(struct lwp *l, const struct sys___lfs_segwait50_args *uap, + register_t *retval) +{ + /* { + syscallarg(fsid_t *) fsidp; + syscallarg(struct timeval *) tv; + } */ + struct timeval atv; + fsid_t fsid; + int error; + + /* XXX need we be su to segwait? */ + if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, + NULL)) != 0) + return (error); + if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) + return (error); + + if (SCARG(uap, tv)) { + error = copyin(SCARG(uap, tv), &atv, sizeof(struct timeval)); + if (error) + return (error); + if (itimerfix(&atv)) + return (EINVAL); + } else /* NULL or invalid */ + atv.tv_sec = atv.tv_usec = 0; + return lfs_segwait(&fsid, &atv); +} + +/* + * VFS_VGET call specialized for the cleaner. The cleaner already knows the + * daddr from the ifile, so don't look it up again. If the cleaner is + * processing IINFO structures, it may have the ondisk inode already, so + * don't go retrieving it again. + * + * we lfs_vref, and it is the caller's responsibility to lfs_vunref + * when finished. + */ + +int +lfs_fasthashget(dev_t dev, ino_t ino, struct vnode **vpp) +{ + struct vnode *vp; + + mutex_enter(&ufs_ihash_lock); + if ((vp = ufs_ihashlookup(dev, ino)) != NULL) { + mutex_enter(vp->v_interlock); + mutex_exit(&ufs_ihash_lock); + if (vp->v_iflag & VI_XLOCK) { + DLOG((DLOG_CLEAN, "lfs_fastvget: ino %d VI_XLOCK\n", + ino)); + lfs_stats.clean_vnlocked++; + mutex_exit(vp->v_interlock); + return EAGAIN; + } + if (lfs_vref(vp)) { + DLOG((DLOG_CLEAN, "lfs_fastvget: lfs_vref failed" + " for ino %d\n", ino)); + lfs_stats.clean_inlocked++; + return EAGAIN; + } + } else { + mutex_exit(&ufs_ihash_lock); + } + *vpp = vp; + + return (0); +} + +int +lfs_fastvget(struct mount *mp, ino_t ino, daddr_t daddr, struct vnode **vpp, + struct ufs1_dinode *dinp) +{ + struct inode *ip; + struct ufs1_dinode *dip; + struct vnode *vp; + struct ufsmount *ump; + dev_t dev; + int error, retries; + struct buf *bp; + struct lfs *fs; + + ump = VFSTOUFS(mp); + dev = ump->um_dev; + fs = ump->um_lfs; + + /* + * Wait until the filesystem is fully mounted before allowing vget + * to complete. This prevents possible problems with roll-forward. + */ + mutex_enter(&lfs_lock); + while (fs->lfs_flags & LFS_NOTYET) { + mtsleep(&fs->lfs_flags, PRIBIO+1, "lfs_fnotyet", 0, + &lfs_lock); + } + mutex_exit(&lfs_lock); + + /* + * This is playing fast and loose. Someone may have the inode + * locked, in which case they are going to be distinctly unhappy + * if we trash something. + */ + + error = lfs_fasthashget(dev, ino, vpp); + if (error != 0 || *vpp != NULL) + return (error); + + /* + * getnewvnode(9) will call vfs_busy, which will block if the + * filesystem is being unmounted; but umount(9) is waiting for + * us because we're already holding the fs busy. + * XXXMP + */ + if (mp->mnt_iflag & IMNT_UNMOUNT) { + *vpp = NULL; + return EDEADLK; + } + error = getnewvnode(VT_LFS, mp, lfs_vnodeop_p, NULL, &vp); + if (error) { + *vpp = NULL; + return (error); + } + + mutex_enter(&ufs_hashlock); + error = lfs_fasthashget(dev, ino, vpp); + if (error != 0 || *vpp != NULL) { + mutex_exit(&ufs_hashlock); + ungetnewvnode(vp); + return (error); + } + + /* Allocate new vnode/inode. 
*/ + lfs_vcreate(mp, ino, vp); + + /* + * Put it onto its hash chain and lock it so that other requests for + * this inode will block if they arrive while we are sleeping waiting + * for old data structures to be purged or for the contents of the + * disk portion of this inode to be read. + */ + ip = VTOI(vp); + ufs_ihashins(ip); + mutex_exit(&ufs_hashlock); + + /* + * XXX + * This may not need to be here, logically it should go down with + * the i_devvp initialization. + * Ask Kirk. + */ + ip->i_lfs = fs; + + /* Read in the disk contents for the inode, copy into the inode. */ + if (dinp) { + error = copyin(dinp, ip->i_din.ffs1_din, sizeof (struct ufs1_dinode)); + if (error) { + DLOG((DLOG_CLEAN, "lfs_fastvget: dinode copyin failed" + " for ino %d\n", ino)); + ufs_ihashrem(ip); + + /* Unlock and discard unneeded inode. */ + VOP_UNLOCK(vp); + lfs_vunref(vp); + *vpp = NULL; + return (error); + } + if (ip->i_number != ino) + panic("lfs_fastvget: I was fed the wrong inode!"); + } else { + retries = 0; + again: + error = bread(ump->um_devvp, fsbtodb(fs, daddr), fs->lfs_ibsize, + NOCRED, 0, &bp); + if (error) { + DLOG((DLOG_CLEAN, "lfs_fastvget: bread failed (%d)\n", + error)); + /* + * The inode does not contain anything useful, so it + * would be misleading to leave it on its hash chain. + * Iput() will return it to the free list. + */ + ufs_ihashrem(ip); + + /* Unlock and discard unneeded inode. */ + VOP_UNLOCK(vp); + lfs_vunref(vp); + brelse(bp, 0); + *vpp = NULL; + return (error); + } + dip = lfs_ifind(ump->um_lfs, ino, bp); + if (dip == NULL) { + /* Assume write has not completed yet; try again */ + brelse(bp, BC_INVAL); + ++retries; + if (retries > LFS_IFIND_RETRIES) + panic("lfs_fastvget: dinode not found"); + DLOG((DLOG_CLEAN, "lfs_fastvget: dinode not found," + " retrying...\n")); + goto again; + } + *ip->i_din.ffs1_din = *dip; + brelse(bp, 0); + } + lfs_vinit(mp, &vp); + + *vpp = vp; + + KASSERT(VOP_ISLOCKED(vp)); + VOP_UNLOCK(vp); + + return (0); +} + +/* + * Make up a "fake" cleaner buffer, copy the data from userland into it. + */ +struct buf * +lfs_fakebuf(struct lfs *fs, struct vnode *vp, int lbn, size_t size, void *uaddr) +{ + struct buf *bp; + int error; + + KASSERT(VTOI(vp)->i_number != LFS_IFILE_INUM); + + bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, size, LFS_NB_CLEAN); + error = copyin(uaddr, bp->b_data, size); + if (error) { + lfs_freebuf(fs, bp); + return NULL; + } + KDASSERT(bp->b_iodone == lfs_callback); + +#if 0 + mutex_enter(&lfs_lock); + ++fs->lfs_iocount; + mutex_exit(&lfs_lock); +#endif + bp->b_bufsize = size; + bp->b_bcount = size; + return (bp); +} diff --git a/sys/ufs/lfs/lfs_vfsops.c b/sys/ufs/lfs/lfs_vfsops.c new file mode 100644 index 000000000..7769e94a1 --- /dev/null +++ b/sys/ufs/lfs/lfs_vfsops.c @@ -0,0 +1,2138 @@ +/* $NetBSD: lfs_vfsops.c,v 1.291 2011/11/14 18:35:14 hannken Exp $ */ + +/*- + * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2007 + * The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Konrad E. Schroder . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/*- + * Copyright (c) 1989, 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)lfs_vfsops.c 8.20 (Berkeley) 6/10/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.291 2011/11/14 18:35:14 hannken Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_lfs.h" +#include "opt_quota.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +MODULE(MODULE_CLASS_VFS, lfs, "ffs"); + +static int lfs_gop_write(struct vnode *, struct vm_page **, int, int); +static bool lfs_issequential_hole(const struct ufsmount *, + daddr_t, daddr_t); + +static int lfs_mountfs(struct vnode *, struct mount *, struct lwp *); + +static struct sysctllog *lfs_sysctl_log; + +extern const struct vnodeopv_desc lfs_vnodeop_opv_desc; +extern const struct vnodeopv_desc lfs_specop_opv_desc; +extern const struct vnodeopv_desc lfs_fifoop_opv_desc; + +pid_t lfs_writer_daemon = 0; +int lfs_do_flush = 0; +#ifdef LFS_KERNEL_RFW +int lfs_do_rfw = 0; +#endif + +const struct vnodeopv_desc * const lfs_vnodeopv_descs[] = { + &lfs_vnodeop_opv_desc, + &lfs_specop_opv_desc, + &lfs_fifoop_opv_desc, + NULL, +}; + +struct vfsops lfs_vfsops = { + MOUNT_LFS, + sizeof (struct ufs_args), + lfs_mount, + ufs_start, + lfs_unmount, + ufs_root, + ufs_quotactl, + lfs_statvfs, + lfs_sync, + lfs_vget, + lfs_fhtovp, + lfs_vptofh, + lfs_init, + lfs_reinit, + lfs_done, + lfs_mountroot, + (int (*)(struct mount *, struct vnode *, struct timespec *)) eopnotsupp, + vfs_stdextattrctl, + (void *)eopnotsupp, /* vfs_suspendctl */ + genfs_renamelock_enter, + genfs_renamelock_exit, + (void *)eopnotsupp, + lfs_vnodeopv_descs, + 0, + { NULL, NULL }, +}; + +const struct genfs_ops lfs_genfsops = { + .gop_size = lfs_gop_size, + .gop_alloc = ufs_gop_alloc, + .gop_write = lfs_gop_write, + .gop_markupdate = ufs_gop_markupdate, +}; + +static const struct ufs_ops lfs_ufsops = { + .uo_itimes = NULL, + .uo_update = lfs_update, + .uo_truncate = lfs_truncate, + .uo_valloc = lfs_valloc, + .uo_vfree = lfs_vfree, + .uo_balloc = lfs_balloc, + .uo_unmark_vnode = lfs_unmark_vnode, +}; + +struct shortlong { + const char *sname; + const char *lname; +}; + +static int +sysctl_lfs_dostats(SYSCTLFN_ARGS) +{ + extern struct lfs_stats lfs_stats; + extern int lfs_dostats; + int error; + + error = sysctl_lookup(SYSCTLFN_CALL(rnode)); + if (error || newp == NULL) + return (error); + + if (lfs_dostats == 0) + memset(&lfs_stats, 0, sizeof(lfs_stats)); + + return (0); +} + +static void +lfs_sysctl_setup(struct sysctllog **clog) +{ + int i; + extern int lfs_writeindir, lfs_dostats, lfs_clean_vnhead, + lfs_fs_pagetrip, lfs_ignore_lazy_sync; +#ifdef DEBUG + extern int lfs_debug_log_subsys[DLOG_MAX]; + struct shortlong dlog_names[DLOG_MAX] = { /* Must match lfs.h ! 
*/ + { "rollforward", "Debug roll-forward code" }, + { "alloc", "Debug inode allocation and free list" }, + { "avail", "Debug space-available-now accounting" }, + { "flush", "Debug flush triggers" }, + { "lockedlist", "Debug locked list accounting" }, + { "vnode_verbose", "Verbose per-vnode-written debugging" }, + { "vnode", "Debug vnode use during segment write" }, + { "segment", "Debug segment writing" }, + { "seguse", "Debug segment used-bytes accounting" }, + { "cleaner", "Debug cleaning routines" }, + { "mount", "Debug mount/unmount routines" }, + { "pagecache", "Debug UBC interactions" }, + { "dirop", "Debug directory-operation accounting" }, + { "malloc", "Debug private malloc accounting" }, + }; +#endif /* DEBUG */ + struct shortlong stat_names[] = { /* Must match lfs.h! */ + { "segsused", "Number of new segments allocated" }, + { "psegwrites", "Number of partial-segment writes" }, + { "psyncwrites", "Number of synchronous partial-segment" + " writes" }, + { "pcleanwrites", "Number of partial-segment writes by the" + " cleaner" }, + { "blocktot", "Number of blocks written" }, + { "cleanblocks", "Number of blocks written by the cleaner" }, + { "ncheckpoints", "Number of checkpoints made" }, + { "nwrites", "Number of whole writes" }, + { "nsync_writes", "Number of synchronous writes" }, + { "wait_exceeded", "Number of times writer waited for" + " cleaner" }, + { "write_exceeded", "Number of times writer invoked flush" }, + { "flush_invoked", "Number of times flush was invoked" }, + { "vflush_invoked", "Number of time vflush was called" }, + { "clean_inlocked", "Number of vnodes skipped for VI_XLOCK" }, + { "clean_vnlocked", "Number of vnodes skipped for vget failure" }, + { "segs_reclaimed", "Number of segments reclaimed" }, + }; + + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "vfs", NULL, + NULL, 0, NULL, 0, + CTL_VFS, CTL_EOL); + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "lfs", + SYSCTL_DESCR("Log-structured file system"), + NULL, 0, NULL, 0, + CTL_VFS, 5, CTL_EOL); + /* + * XXX the "5" above could be dynamic, thereby eliminating one + * more instance of the "number to vfs" mapping problem, but + * "5" is the order as taken from sys/mount.h + */ + + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "flushindir", NULL, + NULL, 0, &lfs_writeindir, 0, + CTL_VFS, 5, LFS_WRITEINDIR, CTL_EOL); + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "clean_vnhead", NULL, + NULL, 0, &lfs_clean_vnhead, 0, + CTL_VFS, 5, LFS_CLEAN_VNHEAD, CTL_EOL); + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "dostats", + SYSCTL_DESCR("Maintain statistics on LFS operations"), + sysctl_lfs_dostats, 0, &lfs_dostats, 0, + CTL_VFS, 5, LFS_DOSTATS, CTL_EOL); + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "pagetrip", + SYSCTL_DESCR("How many dirty pages in fs triggers" + " a flush"), + NULL, 0, &lfs_fs_pagetrip, 0, + CTL_VFS, 5, LFS_FS_PAGETRIP, CTL_EOL); + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "ignore_lazy_sync", + SYSCTL_DESCR("Lazy Sync is ignored entirely"), + NULL, 0, &lfs_ignore_lazy_sync, 0, + CTL_VFS, 5, LFS_IGNORE_LAZY_SYNC, CTL_EOL); +#ifdef LFS_KERNEL_RFW + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "rfw", + SYSCTL_DESCR("Use in-kernel roll-forward on mount"), + NULL, 0, &lfs_do_rfw, 
0, + CTL_VFS, 5, LFS_DO_RFW, CTL_EOL); +#endif + + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "stats", + SYSCTL_DESCR("Debugging options"), + NULL, 0, NULL, 0, + CTL_VFS, 5, LFS_STATS, CTL_EOL); + for (i = 0; i < sizeof(struct lfs_stats) / sizeof(u_int); i++) { + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READONLY, + CTLTYPE_INT, stat_names[i].sname, + SYSCTL_DESCR(stat_names[i].lname), + NULL, 0, &(((u_int *)&lfs_stats.segsused)[i]), + 0, CTL_VFS, 5, LFS_STATS, i, CTL_EOL); + } + +#ifdef DEBUG + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "debug", + SYSCTL_DESCR("Debugging options"), + NULL, 0, NULL, 0, + CTL_VFS, 5, LFS_DEBUGLOG, CTL_EOL); + for (i = 0; i < DLOG_MAX; i++) { + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, dlog_names[i].sname, + SYSCTL_DESCR(dlog_names[i].lname), + NULL, 0, &(lfs_debug_log_subsys[i]), 0, + CTL_VFS, 5, LFS_DEBUGLOG, i, CTL_EOL); + } +#endif +} + +/* old cleaner syscall interface. see VOP_FCNTL() */ +static const struct syscall_package lfs_syscalls[] = { + { SYS_lfs_bmapv, 0, (sy_call_t *)sys_lfs_bmapv }, + { SYS_lfs_markv, 0, (sy_call_t *)sys_lfs_markv }, + { SYS_lfs_segclean, 0, (sy_call_t *)sys___lfs_segwait50 }, + { 0, 0, NULL }, +}; + +static int +lfs_modcmd(modcmd_t cmd, void *arg) +{ + int error; + + switch (cmd) { + case MODULE_CMD_INIT: + error = syscall_establish(NULL, lfs_syscalls); + if (error) + return error; + error = vfs_attach(&lfs_vfsops); + if (error != 0) { + syscall_disestablish(NULL, lfs_syscalls); + break; + } + lfs_sysctl_setup(&lfs_sysctl_log); + break; + case MODULE_CMD_FINI: + error = vfs_detach(&lfs_vfsops); + if (error != 0) + break; + syscall_disestablish(NULL, lfs_syscalls); + sysctl_teardown(&lfs_sysctl_log); + break; + default: + error = ENOTTY; + break; + } + + return (error); +} + +/* + * XXX Same structure as FFS inodes? Should we share a common pool? + */ +struct pool lfs_inode_pool; +struct pool lfs_dinode_pool; +struct pool lfs_inoext_pool; +struct pool lfs_lbnentry_pool; + +/* + * The writer daemon. UVM keeps track of how many dirty pages we are holding + * in lfs_subsys_pages; the daemon flushes the filesystem when this value + * crosses the (user-defined) threshhold LFS_MAX_PAGES. + */ +static void +lfs_writerd(void *arg) +{ + struct mount *mp, *nmp; + struct lfs *fs; + int fsflags; + int loopcount; + + lfs_writer_daemon = curproc->p_pid; + + mutex_enter(&lfs_lock); + for (;;) { + mtsleep(&lfs_writer_daemon, PVM | PNORELOCK, "lfswriter", hz/10, + &lfs_lock); + + /* + * Look through the list of LFSs to see if any of them + * have requested pageouts. 
+ */ + mutex_enter(&mountlist_lock); + for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist; + mp = nmp) { + if (vfs_busy(mp, &nmp)) { + continue; + } + if (strncmp(mp->mnt_stat.f_fstypename, MOUNT_LFS, + sizeof(mp->mnt_stat.f_fstypename)) == 0) { + fs = VFSTOUFS(mp)->um_lfs; + mutex_enter(&lfs_lock); + fsflags = 0; + if ((fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) || + lfs_dirvcount > LFS_MAX_DIROP) && + fs->lfs_dirops == 0) + fsflags |= SEGM_CKP; + if (fs->lfs_pdflush) { + DLOG((DLOG_FLUSH, "lfs_writerd: pdflush set\n")); + fs->lfs_pdflush = 0; + lfs_flush_fs(fs, fsflags); + mutex_exit(&lfs_lock); + } else if (!TAILQ_EMPTY(&fs->lfs_pchainhd)) { + DLOG((DLOG_FLUSH, "lfs_writerd: pchain non-empty\n")); + mutex_exit(&lfs_lock); + lfs_writer_enter(fs, "wrdirop"); + lfs_flush_pchain(fs); + lfs_writer_leave(fs); + } else + mutex_exit(&lfs_lock); + } + vfs_unbusy(mp, false, &nmp); + } + mutex_exit(&mountlist_lock); + + /* + * If global state wants a flush, flush everything. + */ + mutex_enter(&lfs_lock); + loopcount = 0; + if (lfs_do_flush || locked_queue_count > LFS_MAX_BUFS || + locked_queue_bytes > LFS_MAX_BYTES || + lfs_subsys_pages > LFS_MAX_PAGES) { + + if (lfs_do_flush) { + DLOG((DLOG_FLUSH, "daemon: lfs_do_flush\n")); + } + if (locked_queue_count > LFS_MAX_BUFS) { + DLOG((DLOG_FLUSH, "daemon: lqc = %d, max %d\n", + locked_queue_count, LFS_MAX_BUFS)); + } + if (locked_queue_bytes > LFS_MAX_BYTES) { + DLOG((DLOG_FLUSH, "daemon: lqb = %ld, max %ld\n", + locked_queue_bytes, LFS_MAX_BYTES)); + } + if (lfs_subsys_pages > LFS_MAX_PAGES) { + DLOG((DLOG_FLUSH, "daemon: lssp = %d, max %d\n", + lfs_subsys_pages, LFS_MAX_PAGES)); + } + + lfs_flush(NULL, SEGM_WRITERD, 0); + lfs_do_flush = 0; + } + } + /* NOTREACHED */ +} + +/* + * Initialize the filesystem, most work done by ufs_init. + */ +void +lfs_init(void) +{ + + malloc_type_attach(M_SEGMENT); + pool_init(&lfs_inode_pool, sizeof(struct inode), 0, 0, 0, + "lfsinopl", &pool_allocator_nointr, IPL_NONE); + pool_init(&lfs_dinode_pool, sizeof(struct ufs1_dinode), 0, 0, 0, + "lfsdinopl", &pool_allocator_nointr, IPL_NONE); + pool_init(&lfs_inoext_pool, sizeof(struct lfs_inode_ext), 8, 0, 0, + "lfsinoextpl", &pool_allocator_nointr, IPL_NONE); + pool_init(&lfs_lbnentry_pool, sizeof(struct lbnentry), 0, 0, 0, + "lfslbnpool", &pool_allocator_nointr, IPL_NONE); + ufs_init(); + +#ifdef DEBUG + memset(lfs_log, 0, sizeof(lfs_log)); +#endif + mutex_init(&lfs_lock, MUTEX_DEFAULT, IPL_NONE); + cv_init(&locked_queue_cv, "lfsbuf"); + cv_init(&lfs_writing_cv, "lfsflush"); +} + +void +lfs_reinit(void) +{ + ufs_reinit(); +} + +void +lfs_done(void) +{ + ufs_done(); + mutex_destroy(&lfs_lock); + cv_destroy(&locked_queue_cv); + cv_destroy(&lfs_writing_cv); + pool_destroy(&lfs_inode_pool); + pool_destroy(&lfs_dinode_pool); + pool_destroy(&lfs_inoext_pool); + pool_destroy(&lfs_lbnentry_pool); + malloc_type_detach(M_SEGMENT); +} + +/* + * Called by main() when ufs is going to be mounted as root. 
+ */ +int +lfs_mountroot(void) +{ + extern struct vnode *rootvp; + struct lfs *fs = NULL; /* LFS */ + struct mount *mp; + struct lwp *l = curlwp; + struct ufsmount *ump; + int error; + + if (device_class(root_device) != DV_DISK) + return (ENODEV); + + if (rootdev == NODEV) + return (ENODEV); + if ((error = vfs_rootmountalloc(MOUNT_LFS, "root_device", &mp))) { + vrele(rootvp); + return (error); + } + if ((error = lfs_mountfs(rootvp, mp, l))) { + vfs_unbusy(mp, false, NULL); + vfs_destroy(mp); + return (error); + } + mutex_enter(&mountlist_lock); + CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mutex_exit(&mountlist_lock); + ump = VFSTOUFS(mp); + fs = ump->um_lfs; + memset(fs->lfs_fsmnt, 0, sizeof(fs->lfs_fsmnt)); + (void)copystr(mp->mnt_stat.f_mntonname, fs->lfs_fsmnt, MNAMELEN - 1, 0); + (void)lfs_statvfs(mp, &mp->mnt_stat); + vfs_unbusy(mp, false, NULL); + setrootfstime((time_t)(VFSTOUFS(mp)->um_lfs->lfs_tstamp)); + return (0); +} + +/* + * VFS Operations. + * + * mount system call + */ +int +lfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) +{ + struct lwp *l = curlwp; + struct vnode *devvp; + struct ufs_args *args = data; + struct ufsmount *ump = NULL; + struct lfs *fs = NULL; /* LFS */ + int error = 0, update; + mode_t accessmode; + + if (*data_len < sizeof *args) + return EINVAL; + + if (mp->mnt_flag & MNT_GETARGS) { + ump = VFSTOUFS(mp); + if (ump == NULL) + return EIO; + args->fspec = NULL; + *data_len = sizeof *args; + return 0; + } + + update = mp->mnt_flag & MNT_UPDATE; + + /* Check arguments */ + if (args->fspec != NULL) { + /* + * Look up the name and verify that it's sane. + */ + error = namei_simple_user(args->fspec, + NSM_FOLLOW_NOEMULROOT, &devvp); + if (error != 0) + return (error); + + if (!update) { + /* + * Be sure this is a valid block device + */ + if (devvp->v_type != VBLK) + error = ENOTBLK; + else if (bdevsw_lookup(devvp->v_rdev) == NULL) + error = ENXIO; + } else { + /* + * Be sure we're still naming the same device + * used for our initial mount + */ + ump = VFSTOUFS(mp); + if (devvp != ump->um_devvp) { + if (devvp->v_rdev != ump->um_devvp->v_rdev) + error = EINVAL; + else { + vrele(devvp); + devvp = ump->um_devvp; + vref(devvp); + } + } + } + } else { + if (!update) { + /* New mounts must have a filename for the device */ + return (EINVAL); + } else { + /* Use the extant mount */ + ump = VFSTOUFS(mp); + devvp = ump->um_devvp; + vref(devvp); + } + } + + + /* + * If mount by non-root, then verify that user has necessary + * permissions on the device. + */ + if (error == 0) { + accessmode = VREAD; + if (update ? + (mp->mnt_iflag & IMNT_WANTRDWR) != 0 : + (mp->mnt_flag & MNT_RDONLY) == 0) + accessmode |= VWRITE; + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = genfs_can_mount(devvp, accessmode, l->l_cred); + VOP_UNLOCK(devvp); + } + + if (error) { + vrele(devvp); + return (error); + } + + if (!update) { + int flags; + + if (mp->mnt_flag & MNT_RDONLY) + flags = FREAD; + else + flags = FREAD|FWRITE; + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_OPEN(devvp, flags, FSCRED); + VOP_UNLOCK(devvp); + if (error) + goto fail; + error = lfs_mountfs(devvp, mp, l); /* LFS */ + if (error) { + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + (void)VOP_CLOSE(devvp, flags, NOCRED); + VOP_UNLOCK(devvp); + goto fail; + } + + ump = VFSTOUFS(mp); + fs = ump->um_lfs; + } else { + /* + * Update the mount. + */ + + /* + * The initial mount got a reference on this + * device, so drop the one obtained via + * namei(), above. 
+ */ + vrele(devvp); + + ump = VFSTOUFS(mp); + fs = ump->um_lfs; + if (fs->lfs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR)) { + /* + * Changing from read-only to read/write. + * Note in the superblocks that we're writing. + */ + fs->lfs_ronly = 0; + if (fs->lfs_pflags & LFS_PF_CLEAN) { + fs->lfs_pflags &= ~LFS_PF_CLEAN; + lfs_writesuper(fs, fs->lfs_sboffs[0]); + lfs_writesuper(fs, fs->lfs_sboffs[1]); + } + } + if (args->fspec == NULL) + return EINVAL; + } + + error = set_statvfs_info(path, UIO_USERSPACE, args->fspec, + UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l); + if (error == 0) + (void)strncpy(fs->lfs_fsmnt, mp->mnt_stat.f_mntonname, + sizeof(fs->lfs_fsmnt)); + return error; + +fail: + vrele(devvp); + return (error); +} + + +/* + * Common code for mount and mountroot + * LFS specific + */ +int +lfs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l) +{ + struct dlfs *tdfs, *dfs, *adfs; + struct lfs *fs; + struct ufsmount *ump; + struct vnode *vp; + struct buf *bp, *abp; + dev_t dev; + int error, i, ronly, fsbsize; + kauth_cred_t cred; + CLEANERINFO *cip; + SEGUSE *sup; + daddr_t sb_addr; + + cred = l ? l->l_cred : NOCRED; + + /* + * Flush out any old buffers remaining from a previous use. + */ + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = vinvalbuf(devvp, V_SAVE, cred, l, 0, 0); + VOP_UNLOCK(devvp); + if (error) + return (error); + + ronly = (mp->mnt_flag & MNT_RDONLY) != 0; + + /* Don't free random space on error. */ + bp = NULL; + abp = NULL; + ump = NULL; + + sb_addr = LFS_LABELPAD / DEV_BSIZE; + while (1) { + /* Read in the superblock. */ + error = bread(devvp, sb_addr, LFS_SBPAD, cred, 0, &bp); + if (error) + goto out; + dfs = (struct dlfs *)bp->b_data; + + /* Check the basics. */ + if (dfs->dlfs_magic != LFS_MAGIC || dfs->dlfs_bsize > MAXBSIZE || + dfs->dlfs_version > LFS_VERSION || + dfs->dlfs_bsize < sizeof(struct dlfs)) { + DLOG((DLOG_MOUNT, "lfs_mountfs: primary superblock sanity failed\n")); + error = EINVAL; /* XXX needs translation */ + goto out; + } + if (dfs->dlfs_inodefmt > LFS_MAXINODEFMT) { + DLOG((DLOG_MOUNT, "lfs_mountfs: unknown inode format %d\n", + dfs->dlfs_inodefmt)); + error = EINVAL; + goto out; + } + + if (dfs->dlfs_version == 1) + fsbsize = DEV_BSIZE; + else { + fsbsize = 1 << dfs->dlfs_ffshift; + /* + * Could be, if the frag size is large enough, that we + * don't have the "real" primary superblock. If that's + * the case, get the real one, and try again. + */ + if (sb_addr != (dfs->dlfs_sboffs[0] << (dfs->dlfs_ffshift - DEV_BSHIFT))) { + DLOG((DLOG_MOUNT, "lfs_mountfs: sb daddr" + " 0x%llx is not right, trying 0x%llx\n", + (long long)sb_addr, + (long long)(dfs->dlfs_sboffs[0] << (dfs->dlfs_ffshift - DEV_BSHIFT)))); + sb_addr = dfs->dlfs_sboffs[0] << (dfs->dlfs_ffshift - DEV_BSHIFT); + brelse(bp, 0); + continue; + } + } + break; + } + + /* + * Check the second superblock to see which is newer; then mount + * using the older of the two. This is necessary to ensure that + * the filesystem is valid if it was not unmounted cleanly. 
+ */ + + if (dfs->dlfs_sboffs[1] && + dfs->dlfs_sboffs[1] - LFS_LABELPAD / fsbsize > LFS_SBPAD / fsbsize) + { + error = bread(devvp, dfs->dlfs_sboffs[1] * (fsbsize / DEV_BSIZE), + LFS_SBPAD, cred, 0, &abp); + if (error) + goto out; + adfs = (struct dlfs *)abp->b_data; + + if (dfs->dlfs_version == 1) { + /* 1s resolution comparison */ + if (adfs->dlfs_tstamp < dfs->dlfs_tstamp) + tdfs = adfs; + else + tdfs = dfs; + } else { + /* monotonic infinite-resolution comparison */ + if (adfs->dlfs_serial < dfs->dlfs_serial) + tdfs = adfs; + else + tdfs = dfs; + } + + /* Check the basics. */ + if (tdfs->dlfs_magic != LFS_MAGIC || + tdfs->dlfs_bsize > MAXBSIZE || + tdfs->dlfs_version > LFS_VERSION || + tdfs->dlfs_bsize < sizeof(struct dlfs)) { + DLOG((DLOG_MOUNT, "lfs_mountfs: alt superblock" + " sanity failed\n")); + error = EINVAL; /* XXX needs translation */ + goto out; + } + } else { + DLOG((DLOG_MOUNT, "lfs_mountfs: invalid alt superblock" + " daddr=0x%x\n", dfs->dlfs_sboffs[1])); + error = EINVAL; + goto out; + } + + /* Allocate the mount structure, copy the superblock into it. */ + fs = malloc(sizeof(struct lfs), M_UFSMNT, M_WAITOK | M_ZERO); + memcpy(&fs->lfs_dlfs, tdfs, sizeof(struct dlfs)); + + /* Compatibility */ + if (fs->lfs_version < 2) { + fs->lfs_sumsize = LFS_V1_SUMMARY_SIZE; + fs->lfs_ibsize = fs->lfs_bsize; + fs->lfs_start = fs->lfs_sboffs[0]; + fs->lfs_tstamp = fs->lfs_otstamp; + fs->lfs_fsbtodb = 0; + } + if (fs->lfs_resvseg == 0) + fs->lfs_resvseg = MIN(fs->lfs_minfreeseg - 1, \ + MAX(MIN_RESV_SEGS, fs->lfs_minfreeseg / 2 + 1)); + + /* + * If we aren't going to be able to write meaningfully to this + * filesystem, and were not mounted readonly, bomb out now. + */ + if (fsbtob(fs, LFS_NRESERVE(fs)) > LFS_MAX_BYTES && !ronly) { + DLOG((DLOG_MOUNT, "lfs_mount: to mount this filesystem read/write," + " we need BUFPAGES >= %lld\n", + (long long)((bufmem_hiwater / bufmem_lowater) * + LFS_INVERSE_MAX_BYTES( + fsbtob(fs, LFS_NRESERVE(fs))) >> PAGE_SHIFT))); + free(fs, M_UFSMNT); + error = EFBIG; /* XXX needs translation */ + goto out; + } + + /* Before rolling forward, lock so vget will sleep for other procs */ + if (l != NULL) { + fs->lfs_flags = LFS_NOTYET; + fs->lfs_rfpid = l->l_proc->p_pid; + } + + ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO); + ump->um_lfs = fs; + ump->um_ops = &lfs_ufsops; + ump->um_fstype = UFS1; + if (sizeof(struct lfs) < LFS_SBPAD) { /* XXX why? */ + brelse(bp, BC_INVAL); + brelse(abp, BC_INVAL); + } else { + brelse(bp, 0); + brelse(abp, 0); + } + bp = NULL; + abp = NULL; + + + /* Set up the I/O information */ + fs->lfs_devbsize = DEV_BSIZE; + fs->lfs_iocount = 0; + fs->lfs_diropwait = 0; + fs->lfs_activesb = 0; + fs->lfs_uinodes = 0; + fs->lfs_ravail = 0; + fs->lfs_favail = 0; + fs->lfs_sbactive = 0; + + /* Set up the ifile and lock aflags */ + fs->lfs_doifile = 0; + fs->lfs_writer = 0; + fs->lfs_dirops = 0; + fs->lfs_nadirop = 0; + fs->lfs_seglock = 0; + fs->lfs_pdflush = 0; + fs->lfs_sleepers = 0; + fs->lfs_pages = 0; + rw_init(&fs->lfs_fraglock); + rw_init(&fs->lfs_iflock); + cv_init(&fs->lfs_stopcv, "lfsstop"); + + /* Set the file system readonly/modify bits. */ + fs->lfs_ronly = ronly; + if (ronly == 0) + fs->lfs_fmod = 1; + + /* Initialize the mount structure. 
*/ + dev = devvp->v_rdev; + mp->mnt_data = ump; + mp->mnt_stat.f_fsidx.__fsid_val[0] = (long)dev; + mp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_LFS); + mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; + mp->mnt_stat.f_namemax = LFS_MAXNAMLEN; + mp->mnt_stat.f_iosize = fs->lfs_bsize; + mp->mnt_flag |= MNT_LOCAL; + mp->mnt_fs_bshift = fs->lfs_bshift; + ump->um_flags = 0; + ump->um_mountp = mp; + ump->um_dev = dev; + ump->um_devvp = devvp; + ump->um_bptrtodb = fs->lfs_ffshift - DEV_BSHIFT; + ump->um_seqinc = fs->lfs_frag; + ump->um_nindir = fs->lfs_nindir; + ump->um_lognindir = ffs(fs->lfs_nindir) - 1; + for (i = 0; i < MAXQUOTAS; i++) + ump->um_quotas[i] = NULLVP; + ump->um_maxsymlinklen = fs->lfs_maxsymlinklen; + ump->um_dirblksiz = DIRBLKSIZ; + ump->um_maxfilesize = fs->lfs_maxfilesize; + if (ump->um_maxsymlinklen > 0) + mp->mnt_iflag |= IMNT_DTYPE; + devvp->v_specmountpoint = mp; + + /* Set up reserved memory for pageout */ + lfs_setup_resblks(fs); + /* Set up vdirop tailq */ + TAILQ_INIT(&fs->lfs_dchainhd); + /* and paging tailq */ + TAILQ_INIT(&fs->lfs_pchainhd); + /* and delayed segment accounting for truncation list */ + LIST_INIT(&fs->lfs_segdhd); + + /* + * We use the ifile vnode for almost every operation. Instead of + * retrieving it from the hash table each time we retrieve it here, + * artificially increment the reference count and keep a pointer + * to it in the incore copy of the superblock. + */ + if ((error = VFS_VGET(mp, LFS_IFILE_INUM, &vp)) != 0) { + DLOG((DLOG_MOUNT, "lfs_mountfs: ifile vget failed, error=%d\n", error)); + goto out; + } + fs->lfs_ivnode = vp; + vref(vp); + + /* Set up inode bitmap and order free list */ + lfs_order_freelist(fs); + + /* Set up segment usage flags for the autocleaner. */ + fs->lfs_nactive = 0; + fs->lfs_suflags = (u_int32_t **)malloc(2 * sizeof(u_int32_t *), + M_SEGMENT, M_WAITOK); + fs->lfs_suflags[0] = (u_int32_t *)malloc(fs->lfs_nseg * sizeof(u_int32_t), + M_SEGMENT, M_WAITOK); + fs->lfs_suflags[1] = (u_int32_t *)malloc(fs->lfs_nseg * sizeof(u_int32_t), + M_SEGMENT, M_WAITOK); + memset(fs->lfs_suflags[1], 0, fs->lfs_nseg * sizeof(u_int32_t)); + for (i = 0; i < fs->lfs_nseg; i++) { + int changed; + + LFS_SEGENTRY(sup, fs, i, bp); + changed = 0; + if (!ronly) { + if (sup->su_nbytes == 0 && + !(sup->su_flags & SEGUSE_EMPTY)) { + sup->su_flags |= SEGUSE_EMPTY; + ++changed; + } else if (!(sup->su_nbytes == 0) && + (sup->su_flags & SEGUSE_EMPTY)) { + sup->su_flags &= ~SEGUSE_EMPTY; + ++changed; + } + if (sup->su_flags & (SEGUSE_ACTIVE|SEGUSE_INVAL)) { + sup->su_flags &= ~(SEGUSE_ACTIVE|SEGUSE_INVAL); + ++changed; + } + } + fs->lfs_suflags[0][i] = sup->su_flags; + if (changed) + LFS_WRITESEGENTRY(sup, fs, i, bp); + else + brelse(bp, 0); + } + +#ifdef LFS_KERNEL_RFW + lfs_roll_forward(fs, mp, l); +#endif + + /* If writing, sb is not clean; record in case of immediate crash */ + if (!fs->lfs_ronly) { + fs->lfs_pflags &= ~LFS_PF_CLEAN; + lfs_writesuper(fs, fs->lfs_sboffs[0]); + lfs_writesuper(fs, fs->lfs_sboffs[1]); + } + + /* Allow vget now that roll-forward is complete */ + fs->lfs_flags &= ~(LFS_NOTYET); + wakeup(&fs->lfs_flags); + + /* + * Initialize the ifile cleaner info with information from + * the superblock. + */ + LFS_CLEANERINFO(cip, fs, bp); + cip->clean = fs->lfs_nclean; + cip->dirty = fs->lfs_nseg - fs->lfs_nclean; + cip->avail = fs->lfs_avail; + cip->bfree = fs->lfs_bfree; + (void) LFS_BWRITE_LOG(bp); /* Ifile */ + + /* + * Mark the current segment as ACTIVE, since we're going to + * be writing to it. 
+ */ + LFS_SEGENTRY(sup, fs, dtosn(fs, fs->lfs_offset), bp); + sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE; + fs->lfs_nactive++; + LFS_WRITESEGENTRY(sup, fs, dtosn(fs, fs->lfs_offset), bp); /* Ifile */ + + /* Now that roll-forward is done, unlock the Ifile */ + vput(vp); + + /* Start the pagedaemon-anticipating daemon */ + if (lfs_writer_daemon == 0 && kthread_create(PRI_BIO, 0, NULL, + lfs_writerd, NULL, NULL, "lfs_writer") != 0) + panic("fork lfs_writer"); + /* + * XXX: Get extra reference to LFS vfsops. This prevents unload, + * but also prevents kernel panic due to text being unloaded + * from below lfs_writerd. When lfs_writerd can exit, remove + * this!!! + */ + vfs_getopsbyname(MOUNT_LFS); + + printf("WARNING: the log-structured file system is experimental\n" + "WARNING: it may cause system crashes and/or corrupt data\n"); + + return (0); + +out: + if (bp) + brelse(bp, 0); + if (abp) + brelse(abp, 0); + if (ump) { + free(ump->um_lfs, M_UFSMNT); + free(ump, M_UFSMNT); + mp->mnt_data = NULL; + } + + return (error); +} + +/* + * unmount system call + */ +int +lfs_unmount(struct mount *mp, int mntflags) +{ + struct lwp *l = curlwp; + struct ufsmount *ump; + struct lfs *fs; + int error, flags, ronly; + vnode_t *vp; + + flags = 0; + if (mntflags & MNT_FORCE) + flags |= FORCECLOSE; + + ump = VFSTOUFS(mp); + fs = ump->um_lfs; + + /* Two checkpoints */ + lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC); + lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC); + + /* wake up the cleaner so it can die */ + lfs_wakeup_cleaner(fs); + mutex_enter(&lfs_lock); + while (fs->lfs_sleepers) + mtsleep(&fs->lfs_sleepers, PRIBIO + 1, "lfs_sleepers", 0, + &lfs_lock); + mutex_exit(&lfs_lock); + +#ifdef QUOTA + if ((error = quota1_umount(mp, flags)) != 0) + return (error); +#endif + if ((error = vflush(mp, fs->lfs_ivnode, flags)) != 0) + return (error); + if ((error = VFS_SYNC(mp, 1, l->l_cred)) != 0) + return (error); + vp = fs->lfs_ivnode; + mutex_enter(vp->v_interlock); + if (LIST_FIRST(&vp->v_dirtyblkhd)) + panic("lfs_unmount: still dirty blocks on ifile vnode"); + mutex_exit(vp->v_interlock); + + /* Explicitly write the superblock, to update serial and pflags */ + fs->lfs_pflags |= LFS_PF_CLEAN; + lfs_writesuper(fs, fs->lfs_sboffs[0]); + lfs_writesuper(fs, fs->lfs_sboffs[1]); + mutex_enter(&lfs_lock); + while (fs->lfs_iocount) + mtsleep(&fs->lfs_iocount, PRIBIO + 1, "lfs_umount", 0, + &lfs_lock); + mutex_exit(&lfs_lock); + + /* Finish with the Ifile, now that we're done with it */ + vgone(fs->lfs_ivnode); + + ronly = !fs->lfs_ronly; + if (ump->um_devvp->v_type != VBAD) + ump->um_devvp->v_specmountpoint = NULL; + vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_CLOSE(ump->um_devvp, + ronly ? FREAD : FREAD|FWRITE, NOCRED); + vput(ump->um_devvp); + + /* Complain about page leakage */ + if (fs->lfs_pages > 0) + printf("lfs_unmount: still claim %d pages (%d in subsystem)\n", + fs->lfs_pages, lfs_subsys_pages); + + /* Free per-mount data structures */ + free(fs->lfs_ino_bitmap, M_SEGMENT); + free(fs->lfs_suflags[0], M_SEGMENT); + free(fs->lfs_suflags[1], M_SEGMENT); + free(fs->lfs_suflags, M_SEGMENT); + lfs_free_resblks(fs); + cv_destroy(&fs->lfs_stopcv); + rw_destroy(&fs->lfs_fraglock); + rw_destroy(&fs->lfs_iflock); + free(fs, M_UFSMNT); + free(ump, M_UFSMNT); + + mp->mnt_data = NULL; + mp->mnt_flag &= ~MNT_LOCAL; + return (error); +} + +/* + * Get file system statistics. + * + * NB: We don't lock to access the superblock here, because it's not + * really that important if we get it wrong. 
+ */ +int +lfs_statvfs(struct mount *mp, struct statvfs *sbp) +{ + struct lfs *fs; + struct ufsmount *ump; + + ump = VFSTOUFS(mp); + fs = ump->um_lfs; + if (fs->lfs_magic != LFS_MAGIC) + panic("lfs_statvfs: magic"); + + sbp->f_bsize = fs->lfs_bsize; + sbp->f_frsize = fs->lfs_fsize; + sbp->f_iosize = fs->lfs_bsize; + sbp->f_blocks = LFS_EST_NONMETA(fs) - VTOI(fs->lfs_ivnode)->i_lfs_effnblks; + + sbp->f_bfree = LFS_EST_BFREE(fs); + KASSERT(sbp->f_bfree <= fs->lfs_dsize); +#if 0 + if (sbp->f_bfree < 0) + sbp->f_bfree = 0; +#endif + + sbp->f_bresvd = LFS_EST_RSVD(fs); + if (sbp->f_bfree > sbp->f_bresvd) + sbp->f_bavail = sbp->f_bfree - sbp->f_bresvd; + else + sbp->f_bavail = 0; + + sbp->f_files = fs->lfs_bfree / btofsb(fs, fs->lfs_ibsize) * INOPB(fs); + sbp->f_ffree = sbp->f_files - fs->lfs_nfiles; + sbp->f_favail = sbp->f_ffree; + sbp->f_fresvd = 0; + copy_statvfs_info(sbp, mp); + return (0); +} + +/* + * Go through the disk queues to initiate sandbagged IO; + * go through the inodes to write those that have been modified; + * initiate the writing of the super block if it has been modified. + * + * Note: we are always called with the filesystem marked `MPBUSY'. + */ +int +lfs_sync(struct mount *mp, int waitfor, kauth_cred_t cred) +{ + int error; + struct lfs *fs; + + fs = VFSTOUFS(mp)->um_lfs; + if (fs->lfs_ronly) + return 0; + + /* Snapshots should not hose the syncer */ + /* + * XXX Sync can block here anyway, since we don't have a very + * XXX good idea of how much data is pending. If it's more + * XXX than a segment and lfs_nextseg is close to the end of + * XXX the log, we'll likely block. + */ + mutex_enter(&lfs_lock); + if (fs->lfs_nowrap && fs->lfs_nextseg < fs->lfs_curseg) { + mutex_exit(&lfs_lock); + return 0; + } + mutex_exit(&lfs_lock); + + lfs_writer_enter(fs, "lfs_dirops"); + + /* All syncs must be checkpoints until roll-forward is implemented. */ + DLOG((DLOG_FLUSH, "lfs_sync at 0x%x\n", fs->lfs_offset)); + error = lfs_segwrite(mp, SEGM_CKP | (waitfor ? SEGM_SYNC : 0)); + lfs_writer_leave(fs); +#ifdef QUOTA + qsync(mp); +#endif + return (error); +} + +/* + * Look up an LFS dinode number to find its incore vnode. If not already + * in core, read it in from the specified device. Return the inode locked. + * Detection and handling of mount points must be done by the calling routine. + */ +int +lfs_vget(struct mount *mp, ino_t ino, struct vnode **vpp) +{ + struct lfs *fs; + struct ufs1_dinode *dip; + struct inode *ip; + struct buf *bp; + struct ifile *ifp; + struct vnode *vp; + struct ufsmount *ump; + daddr_t daddr; + dev_t dev; + int error, retries; + struct timespec ts; + + memset(&ts, 0, sizeof ts); /* XXX gcc */ + + ump = VFSTOUFS(mp); + dev = ump->um_dev; + fs = ump->um_lfs; + + /* + * If the filesystem is not completely mounted yet, suspend + * any access requests (wait for roll-forward to complete). + */ + mutex_enter(&lfs_lock); + while ((fs->lfs_flags & LFS_NOTYET) && curproc->p_pid != fs->lfs_rfpid) + mtsleep(&fs->lfs_flags, PRIBIO+1, "lfs_notyet", 0, + &lfs_lock); + mutex_exit(&lfs_lock); + +retry: + if ((*vpp = ufs_ihashget(dev, ino, LK_EXCLUSIVE)) != NULL) + return (0); + + error = getnewvnode(VT_LFS, mp, lfs_vnodeop_p, NULL, &vp); + if (error) { + *vpp = NULL; + return (error); + } + + mutex_enter(&ufs_hashlock); + if (ufs_ihashget(dev, ino, 0) != NULL) { + mutex_exit(&ufs_hashlock); + ungetnewvnode(vp); + goto retry; + } + + /* Translate the inode number to a disk address. 
*/ + if (ino == LFS_IFILE_INUM) + daddr = fs->lfs_idaddr; + else { + /* XXX bounds-check this too */ + LFS_IENTRY(ifp, fs, ino, bp); + daddr = ifp->if_daddr; + if (fs->lfs_version > 1) { + ts.tv_sec = ifp->if_atime_sec; + ts.tv_nsec = ifp->if_atime_nsec; + } + + brelse(bp, 0); + if (daddr == LFS_UNUSED_DADDR) { + *vpp = NULLVP; + mutex_exit(&ufs_hashlock); + ungetnewvnode(vp); + return (ENOENT); + } + } + + /* Allocate/init new vnode/inode. */ + lfs_vcreate(mp, ino, vp); + + /* + * Put it onto its hash chain and lock it so that other requests for + * this inode will block if they arrive while we are sleeping waiting + * for old data structures to be purged or for the contents of the + * disk portion of this inode to be read. + */ + ip = VTOI(vp); + ufs_ihashins(ip); + mutex_exit(&ufs_hashlock); + + /* + * XXX + * This may not need to be here, logically it should go down with + * the i_devvp initialization. + * Ask Kirk. + */ + ip->i_lfs = ump->um_lfs; + + /* Read in the disk contents for the inode, copy into the inode. */ + retries = 0; + again: + error = bread(ump->um_devvp, fsbtodb(fs, daddr), + (fs->lfs_version == 1 ? fs->lfs_bsize : fs->lfs_ibsize), + NOCRED, 0, &bp); + if (error) { + /* + * The inode does not contain anything useful, so it would + * be misleading to leave it on its hash chain. With mode + * still zero, it will be unlinked and returned to the free + * list by vput(). + */ + vput(vp); + brelse(bp, 0); + *vpp = NULL; + return (error); + } + + dip = lfs_ifind(fs, ino, bp); + if (dip == NULL) { + /* Assume write has not completed yet; try again */ + brelse(bp, BC_INVAL); + ++retries; + if (retries > LFS_IFIND_RETRIES) { +#ifdef DEBUG + /* If the seglock is held look at the bpp to see + what is there anyway */ + mutex_enter(&lfs_lock); + if (fs->lfs_seglock > 0) { + struct buf **bpp; + struct ufs1_dinode *dp; + int i; + + for (bpp = fs->lfs_sp->bpp; + bpp != fs->lfs_sp->cbpp; ++bpp) { + if ((*bpp)->b_vp == fs->lfs_ivnode && + bpp != fs->lfs_sp->bpp) { + /* Inode block */ + printf("lfs_vget: block 0x%" PRIx64 ": ", + (*bpp)->b_blkno); + dp = (struct ufs1_dinode *)(*bpp)->b_data; + for (i = 0; i < INOPB(fs); i++) + if (dp[i].di_u.inumber) + printf("%d ", dp[i].di_u.inumber); + printf("\n"); + } + } + } + mutex_exit(&lfs_lock); +#endif /* DEBUG */ + panic("lfs_vget: dinode not found"); + } + mutex_enter(&lfs_lock); + if (fs->lfs_iocount) { + DLOG((DLOG_VNODE, "lfs_vget: dinode %d not found, retrying...\n", ino)); + (void)mtsleep(&fs->lfs_iocount, PRIBIO + 1, + "lfs ifind", 1, &lfs_lock); + } else + retries = LFS_IFIND_RETRIES; + mutex_exit(&lfs_lock); + goto again; + } + *ip->i_din.ffs1_din = *dip; + brelse(bp, 0); + + if (fs->lfs_version > 1) { + ip->i_ffs1_atime = ts.tv_sec; + ip->i_ffs1_atimensec = ts.tv_nsec; + } + + lfs_vinit(mp, &vp); + + *vpp = vp; + + KASSERT(VOP_ISLOCKED(vp)); + + return (0); +} + +/* + * File handle to vnode + */ +int +lfs_fhtovp(struct mount *mp, struct fid *fhp, struct vnode **vpp) +{ + struct lfid lfh; + struct buf *bp; + IFILE *ifp; + int32_t daddr; + struct lfs *fs; + vnode_t *vp; + + if (fhp->fid_len != sizeof(struct lfid)) + return EINVAL; + + memcpy(&lfh, fhp, sizeof(lfh)); + if (lfh.lfid_ino < LFS_IFILE_INUM) + return ESTALE; + + fs = VFSTOUFS(mp)->um_lfs; + if (lfh.lfid_ident != fs->lfs_ident) + return ESTALE; + + if (lfh.lfid_ino > + ((VTOI(fs->lfs_ivnode)->i_ffs1_size >> fs->lfs_bshift) - + fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb) + return ESTALE; + + mutex_enter(&ufs_ihash_lock); + vp = ufs_ihashlookup(VFSTOUFS(mp)->um_dev, 
lfh.lfid_ino); + mutex_exit(&ufs_ihash_lock); + if (vp == NULL) { + LFS_IENTRY(ifp, fs, lfh.lfid_ino, bp); + daddr = ifp->if_daddr; + brelse(bp, 0); + if (daddr == LFS_UNUSED_DADDR) + return ESTALE; + } + + return (ufs_fhtovp(mp, &lfh.lfid_ufid, vpp)); +} + +/* + * Vnode pointer to File handle + */ +/* ARGSUSED */ +int +lfs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size) +{ + struct inode *ip; + struct lfid lfh; + + if (*fh_size < sizeof(struct lfid)) { + *fh_size = sizeof(struct lfid); + return E2BIG; + } + *fh_size = sizeof(struct lfid); + ip = VTOI(vp); + memset(&lfh, 0, sizeof(lfh)); + lfh.lfid_len = sizeof(struct lfid); + lfh.lfid_ino = ip->i_number; + lfh.lfid_gen = ip->i_gen; + lfh.lfid_ident = ip->i_lfs->lfs_ident; + memcpy(fhp, &lfh, sizeof(lfh)); + return (0); +} + +/* + * ufs_bmaparray callback function for writing. + * + * Since blocks will be written to the new segment anyway, + * we don't care about current daddr of them. + */ +static bool +lfs_issequential_hole(const struct ufsmount *ump, + daddr_t daddr0, daddr_t daddr1) +{ + daddr0 = (daddr_t)((int32_t)daddr0); /* XXX ondisk32 */ + daddr1 = (daddr_t)((int32_t)daddr1); /* XXX ondisk32 */ + + KASSERT(daddr0 == UNWRITTEN || + (0 <= daddr0 && daddr0 <= LFS_MAX_DADDR)); + KASSERT(daddr1 == UNWRITTEN || + (0 <= daddr1 && daddr1 <= LFS_MAX_DADDR)); + + /* NOTE: all we want to know here is 'hole or not'. */ + /* NOTE: UNASSIGNED is converted to 0 by ufs_bmaparray. */ + + /* + * treat UNWRITTENs and all resident blocks as 'contiguous' + */ + if (daddr0 != 0 && daddr1 != 0) + return true; + + /* + * both are in hole? + */ + if (daddr0 == 0 && daddr1 == 0) + return true; /* all holes are 'contiguous' for us. */ + + return false; +} + +/* + * lfs_gop_write functions exactly like genfs_gop_write, except that + * (1) it requires the seglock to be held by its caller, and sp->fip + * to be properly initialized (it will return without re-initializing + * sp->fip, and without calling lfs_writeseg). + * (2) it uses the remaining space in the segment, rather than VOP_BMAP, + * to determine how large a block it can write at once (though it does + * still use VOP_BMAP to find holes in the file); + * (3) it calls lfs_gatherblock instead of VOP_STRATEGY on its blocks + * (leaving lfs_writeseg to deal with the cluster blocks, so we might + * now have clusters of clusters, ick.) + */ +static int +lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, + int flags) +{ + int i, error, run, haveeof = 0; + int fs_bshift; + vaddr_t kva; + off_t eof, offset, startoffset = 0; + size_t bytes, iobytes, skipbytes; + bool async = (flags & PGO_SYNCIO) == 0; + daddr_t lbn, blkno; + struct vm_page *pg; + struct buf *mbp, *bp; + struct vnode *devvp = VTOI(vp)->i_devvp; + struct inode *ip = VTOI(vp); + struct lfs *fs = ip->i_lfs; + struct segment *sp = fs->lfs_sp; + UVMHIST_FUNC("lfs_gop_write"); UVMHIST_CALLED(ubchist); + + ASSERT_SEGLOCK(fs); + + /* The Ifile lives in the buffer cache */ + KASSERT(vp != fs->lfs_ivnode); + + /* + * We don't want to fill the disk before the cleaner has a chance + * to make room for us. If we're in danger of doing that, fail + * with EAGAIN. The caller will have to notice this, unlock + * so the cleaner can run, relock and try again. + * + * We must write everything, however, if our vnode is being + * reclaimed. 
+ */ + if (LFS_STARVED_FOR_SEGS(fs) && vp != fs->lfs_flushvp) + goto tryagain; + + /* + * Sometimes things slip past the filters in lfs_putpages, + * and the pagedaemon tries to write pages---problem is + * that the pagedaemon never acquires the segment lock. + * + * Alternatively, pages that were clean when we called + * genfs_putpages may have become dirty in the meantime. In this + * case the segment header is not properly set up for blocks + * to be added to it. + * + * Unbusy and unclean the pages, and put them on the ACTIVE + * queue under the hypothesis that they couldn't have got here + * unless they were modified *quite* recently. + * + * XXXUBC that last statement is an oversimplification of course. + */ + if (!LFS_SEGLOCK_HELD(fs) || + (ip->i_lfs_iflags & LFSI_NO_GOP_WRITE) || + (pgs[0]->offset & fs->lfs_bmask) != 0) { + goto tryagain; + } + + UVMHIST_LOG(ubchist, "vp %p pgs %p npages %d flags 0x%x", + vp, pgs, npages, flags); + + GOP_SIZE(vp, vp->v_size, &eof, 0); + haveeof = 1; + + if (vp->v_type == VREG) + fs_bshift = vp->v_mount->mnt_fs_bshift; + else + fs_bshift = DEV_BSHIFT; + error = 0; + pg = pgs[0]; + startoffset = pg->offset; + KASSERT(eof >= 0); + + if (startoffset >= eof) { + goto tryagain; + } else + bytes = MIN(npages << PAGE_SHIFT, eof - startoffset); + skipbytes = 0; + + KASSERT(bytes != 0); + + /* Swap PG_DELWRI for PG_PAGEOUT */ + for (i = 0; i < npages; i++) { + if (pgs[i]->flags & PG_DELWRI) { + KASSERT(!(pgs[i]->flags & PG_PAGEOUT)); + pgs[i]->flags &= ~PG_DELWRI; + pgs[i]->flags |= PG_PAGEOUT; + uvm_pageout_start(1); + mutex_enter(&uvm_pageqlock); + uvm_pageunwire(pgs[i]); + mutex_exit(&uvm_pageqlock); + } + } + + /* + * Check to make sure we're starting on a block boundary. + * We'll check later to make sure we always write entire + * blocks (or fragments). + */ + if (startoffset & fs->lfs_bmask) + printf("%" PRId64 " & %" PRId64 " = %" PRId64 "\n", + startoffset, fs->lfs_bmask, + startoffset & fs->lfs_bmask); + KASSERT((startoffset & fs->lfs_bmask) == 0); + if (bytes & fs->lfs_ffmask) { + printf("lfs_gop_write: asked to write %ld bytes\n", (long)bytes); + panic("lfs_gop_write: non-integer blocks"); + } + + /* + * We could deadlock here on pager_map with UVMPAGER_MAPIN_WAITOK. + * If we would, write what we have and try again. If we don't + * have anything to write, we'll have to sleep. + */ + if ((kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WRITE | + (((SEGSUM *)(sp->segsum))->ss_nfinfo < 1 ? + UVMPAGER_MAPIN_WAITOK : 0))) == 0x0) { + DLOG((DLOG_PAGE, "lfs_gop_write: forcing write\n")); +#if 0 + " with nfinfo=%d at offset 0x%x\n", + (int)((SEGSUM *)(sp->segsum))->ss_nfinfo, + (unsigned)fs->lfs_offset)); +#endif + lfs_updatemeta(sp); + lfs_release_finfo(fs); + (void) lfs_writeseg(fs, sp); + + lfs_acquire_finfo(fs, ip->i_number, ip->i_gen); + + /* + * Having given up all of the pager_map we were holding, + * we can now wait for aiodoned to reclaim it for us + * without fear of deadlock. 
+ */ + kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WRITE | + UVMPAGER_MAPIN_WAITOK); + } + + mbp = getiobuf(NULL, true); + UVMHIST_LOG(ubchist, "vp %p mbp %p num now %d bytes 0x%x", + vp, mbp, vp->v_numoutput, bytes); + mbp->b_bufsize = npages << PAGE_SHIFT; + mbp->b_data = (void *)kva; + mbp->b_resid = mbp->b_bcount = bytes; + mbp->b_cflags = BC_BUSY|BC_AGE; + mbp->b_iodone = uvm_aio_biodone; + + bp = NULL; + for (offset = startoffset; + bytes > 0; + offset += iobytes, bytes -= iobytes) { + lbn = offset >> fs_bshift; + error = ufs_bmaparray(vp, lbn, &blkno, NULL, NULL, &run, + lfs_issequential_hole); + if (error) { + UVMHIST_LOG(ubchist, "ufs_bmaparray() -> %d", + error,0,0,0); + skipbytes += bytes; + bytes = 0; + break; + } + + iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset, + bytes); + if (blkno == (daddr_t)-1) { + skipbytes += iobytes; + continue; + } + + /* + * Discover how much we can really pack into this buffer. + */ + /* If no room in the current segment, finish it up */ + if (sp->sum_bytes_left < sizeof(int32_t) || + sp->seg_bytes_left < (1 << fs->lfs_bshift)) { + int vers; + + lfs_updatemeta(sp); + vers = sp->fip->fi_version; + lfs_release_finfo(fs); + (void) lfs_writeseg(fs, sp); + + lfs_acquire_finfo(fs, ip->i_number, vers); + } + /* Check both for space in segment and space in segsum */ + iobytes = MIN(iobytes, (sp->seg_bytes_left >> fs_bshift) + << fs_bshift); + iobytes = MIN(iobytes, (sp->sum_bytes_left / sizeof(int32_t)) + << fs_bshift); + KASSERT(iobytes > 0); + + /* if it's really one i/o, don't make a second buf */ + if (offset == startoffset && iobytes == bytes) { + bp = mbp; + /* + * All the LFS output is done by the segwriter. It + * will increment numoutput by one for all the bufs it + * recieves. However this buffer needs one extra to + * account for aiodone. + */ + mutex_enter(vp->v_interlock); + vp->v_numoutput++; + mutex_exit(vp->v_interlock); + } else { + bp = getiobuf(NULL, true); + UVMHIST_LOG(ubchist, "vp %p bp %p num now %d", + vp, bp, vp->v_numoutput, 0); + nestiobuf_setup(mbp, bp, offset - pg->offset, iobytes); + /* + * LFS doesn't like async I/O here, dies with + * and assert in lfs_bwrite(). Is that assert + * valid? I retained non-async behaviour when + * converted this to use nestiobuf --pooka + */ + bp->b_flags &= ~B_ASYNC; + } + + /* XXX This is silly ... is this necessary? */ + mutex_enter(&bufcache_lock); + mutex_enter(vp->v_interlock); + bgetvp(vp, bp); + mutex_exit(vp->v_interlock); + mutex_exit(&bufcache_lock); + + bp->b_lblkno = lblkno(fs, offset); + bp->b_private = mbp; + if (devvp->v_type == VBLK) { + bp->b_dev = devvp->v_rdev; + } + VOP_BWRITE(bp->b_vp, bp); + while (lfs_gatherblock(sp, bp, NULL)) + continue; + } + + nestiobuf_done(mbp, skipbytes, error); + if (skipbytes) { + UVMHIST_LOG(ubchist, "skipbytes %d", skipbytes, 0,0,0); + } + UVMHIST_LOG(ubchist, "returning 0", 0,0,0,0); + + if (!async) { + /* Start a segment write. */ + UVMHIST_LOG(ubchist, "flushing", 0,0,0,0); + mutex_enter(&lfs_lock); + lfs_flush(fs, 0, 1); + mutex_exit(&lfs_lock); + } + return (0); + + tryagain: + /* + * We can't write the pages, for whatever reason. + * Clean up after ourselves, and make the caller try again. 
+ */ + mutex_enter(vp->v_interlock); + + /* Tell why we're here, if we know */ + if (ip->i_lfs_iflags & LFSI_NO_GOP_WRITE) { + DLOG((DLOG_PAGE, "lfs_gop_write: clean pages dirtied\n")); + } else if ((pgs[0]->offset & fs->lfs_bmask) != 0) { + DLOG((DLOG_PAGE, "lfs_gop_write: not on block boundary\n")); + } else if (haveeof && startoffset >= eof) { + DLOG((DLOG_PAGE, "lfs_gop_write: ino %d start 0x%" PRIx64 + " eof 0x%" PRIx64 " npages=%d\n", VTOI(vp)->i_number, + pgs[0]->offset, eof, npages)); + } else if (LFS_STARVED_FOR_SEGS(fs)) { + DLOG((DLOG_PAGE, "lfs_gop_write: avail too low\n")); + } else { + DLOG((DLOG_PAGE, "lfs_gop_write: seglock not held\n")); + } + + mutex_enter(&uvm_pageqlock); + for (i = 0; i < npages; i++) { + pg = pgs[i]; + + if (pg->flags & PG_PAGEOUT) + uvm_pageout_done(1); + if (pg->flags & PG_DELWRI) { + uvm_pageunwire(pg); + } + uvm_pageactivate(pg); + pg->flags &= ~(PG_CLEAN|PG_DELWRI|PG_PAGEOUT|PG_RELEASED); + DLOG((DLOG_PAGE, "pg[%d] = %p (vp %p off %" PRIx64 ")\n", i, pg, + vp, pg->offset)); + DLOG((DLOG_PAGE, "pg[%d]->flags = %x\n", i, pg->flags)); + DLOG((DLOG_PAGE, "pg[%d]->pqflags = %x\n", i, pg->pqflags)); + DLOG((DLOG_PAGE, "pg[%d]->uanon = %p\n", i, pg->uanon)); + DLOG((DLOG_PAGE, "pg[%d]->uobject = %p\n", i, pg->uobject)); + DLOG((DLOG_PAGE, "pg[%d]->wire_count = %d\n", i, + pg->wire_count)); + DLOG((DLOG_PAGE, "pg[%d]->loan_count = %d\n", i, + pg->loan_count)); + } + /* uvm_pageunbusy takes care of PG_BUSY, PG_WANTED */ + uvm_page_unbusy(pgs, npages); + mutex_exit(&uvm_pageqlock); + mutex_exit(vp->v_interlock); + return EAGAIN; +} + +/* + * finish vnode/inode initialization. + * used by lfs_vget and lfs_fastvget. + */ +void +lfs_vinit(struct mount *mp, struct vnode **vpp) +{ + struct vnode *vp = *vpp; + struct inode *ip = VTOI(vp); + struct ufsmount *ump = VFSTOUFS(mp); + struct lfs *fs = ump->um_lfs; + int i; + + ip->i_mode = ip->i_ffs1_mode; + ip->i_nlink = ip->i_ffs1_nlink; + ip->i_lfs_osize = ip->i_size = ip->i_ffs1_size; + ip->i_flags = ip->i_ffs1_flags; + ip->i_gen = ip->i_ffs1_gen; + ip->i_uid = ip->i_ffs1_uid; + ip->i_gid = ip->i_ffs1_gid; + + ip->i_lfs_effnblks = ip->i_ffs1_blocks; + ip->i_lfs_odnlink = ip->i_ffs1_nlink; + + /* + * Initialize the vnode from the inode, check for aliases. In all + * cases re-init ip, the underlying vnode/inode may have changed. + */ + ufs_vinit(mp, lfs_specop_p, lfs_fifoop_p, &vp); + ip = VTOI(vp); + + memset(ip->i_lfs_fragsize, 0, NDADDR * sizeof(*ip->i_lfs_fragsize)); + if (vp->v_type != VLNK || ip->i_size >= ip->i_ump->um_maxsymlinklen) { +#ifdef DEBUG + for (i = (ip->i_size + fs->lfs_bsize - 1) >> fs->lfs_bshift; + i < NDADDR; i++) { + if ((vp->v_type == VBLK || vp->v_type == VCHR) && + i == 0) + continue; + if (ip->i_ffs1_db[i] != 0) { +inconsistent: + lfs_dump_dinode(ip->i_din.ffs1_din); + panic("inconsistent inode"); + } + } + for ( ; i < NDADDR + NIADDR; i++) { + if (ip->i_ffs1_ib[i - NDADDR] != 0) { + goto inconsistent; + } + } +#endif /* DEBUG */ + for (i = 0; i < NDADDR; i++) + if (ip->i_ffs1_db[i] != 0) + ip->i_lfs_fragsize[i] = blksize(fs, ip, i); + } + +#ifdef DIAGNOSTIC + if (vp->v_type == VNON) { +# ifdef DEBUG + lfs_dump_dinode(ip->i_din.ffs1_din); +# endif + panic("lfs_vinit: ino %llu is type VNON! (ifmt=%o)\n", + (unsigned long long)ip->i_number, + (ip->i_mode & IFMT) >> 12); + } +#endif /* DIAGNOSTIC */ + + /* + * Finish inode initialization now that aliasing has been resolved. 
+ */ + + ip->i_devvp = ump->um_devvp; + vref(ip->i_devvp); + genfs_node_init(vp, &lfs_genfsops); + uvm_vnp_setsize(vp, ip->i_size); + + /* Initialize hiblk from file size */ + ip->i_lfs_hiblk = lblkno(ip->i_lfs, ip->i_size + ip->i_lfs->lfs_bsize - 1) - 1; + + *vpp = vp; +} + +/* + * Resize the filesystem to contain the specified number of segments. + */ +int +lfs_resize_fs(struct lfs *fs, int newnsegs) +{ + SEGUSE *sup; + struct buf *bp, *obp; + daddr_t olast, nlast, ilast, noff, start, end; + struct vnode *ivp; + struct inode *ip; + int error, badnews, inc, oldnsegs; + int sbbytes, csbbytes, gain, cgain; + int i; + + /* Only support v2 and up */ + if (fs->lfs_version < 2) + return EOPNOTSUPP; + + /* If we're doing nothing, do it fast */ + oldnsegs = fs->lfs_nseg; + if (newnsegs == oldnsegs) + return 0; + + /* We always have to have two superblocks */ + if (newnsegs <= dtosn(fs, fs->lfs_sboffs[1])) + return EFBIG; + + ivp = fs->lfs_ivnode; + ip = VTOI(ivp); + error = 0; + + /* Take the segment lock so no one else calls lfs_newseg() */ + lfs_seglock(fs, SEGM_PROT); + + /* + * Make sure the segments we're going to be losing, if any, + * are in fact empty. We hold the seglock, so their status + * cannot change underneath us. Count the superblocks we lose, + * while we're at it. + */ + sbbytes = csbbytes = 0; + cgain = 0; + for (i = newnsegs; i < oldnsegs; i++) { + LFS_SEGENTRY(sup, fs, i, bp); + badnews = sup->su_nbytes || !(sup->su_flags & SEGUSE_INVAL); + if (sup->su_flags & SEGUSE_SUPERBLOCK) + sbbytes += LFS_SBPAD; + if (!(sup->su_flags & SEGUSE_DIRTY)) { + ++cgain; + if (sup->su_flags & SEGUSE_SUPERBLOCK) + csbbytes += LFS_SBPAD; + } + brelse(bp, 0); + if (badnews) { + error = EBUSY; + goto out; + } + } + + /* Note old and new segment table endpoints, and old ifile size */ + olast = fs->lfs_cleansz + fs->lfs_segtabsz; + nlast = howmany(newnsegs, fs->lfs_sepb) + fs->lfs_cleansz; + ilast = ivp->v_size >> fs->lfs_bshift; + noff = nlast - olast; + + /* + * Make sure no one can use the Ifile while we change it around. + * Even after taking the iflock we need to make sure no one still + * is holding Ifile buffers, so we get each one, to drain them. + * (XXX this could be done better.) 
+ */ + rw_enter(&fs->lfs_iflock, RW_WRITER); + vn_lock(ivp, LK_EXCLUSIVE | LK_RETRY); + for (i = 0; i < ilast; i++) { + bread(ivp, i, fs->lfs_bsize, NOCRED, 0, &bp); + brelse(bp, 0); + } + + /* Allocate new Ifile blocks */ + for (i = ilast; i < ilast + noff; i++) { + if (lfs_balloc(ivp, i * fs->lfs_bsize, fs->lfs_bsize, NOCRED, 0, + &bp) != 0) + panic("balloc extending ifile"); + memset(bp->b_data, 0, fs->lfs_bsize); + VOP_BWRITE(bp->b_vp, bp); + } + + /* Register new ifile size */ + ip->i_size += noff * fs->lfs_bsize; + ip->i_ffs1_size = ip->i_size; + uvm_vnp_setsize(ivp, ip->i_size); + + /* Copy the inode table to its new position */ + if (noff != 0) { + if (noff < 0) { + start = nlast; + end = ilast + noff; + inc = 1; + } else { + start = ilast + noff - 1; + end = nlast - 1; + inc = -1; + } + for (i = start; i != end; i += inc) { + if (bread(ivp, i, fs->lfs_bsize, NOCRED, + B_MODIFY, &bp) != 0) + panic("resize: bread dst blk failed"); + if (bread(ivp, i - noff, fs->lfs_bsize, + NOCRED, 0, &obp)) + panic("resize: bread src blk failed"); + memcpy(bp->b_data, obp->b_data, fs->lfs_bsize); + VOP_BWRITE(bp->b_vp, bp); + brelse(obp, 0); + } + } + + /* If we are expanding, write the new empty SEGUSE entries */ + if (newnsegs > oldnsegs) { + for (i = oldnsegs; i < newnsegs; i++) { + if ((error = bread(ivp, i / fs->lfs_sepb + + fs->lfs_cleansz, fs->lfs_bsize, + NOCRED, B_MODIFY, &bp)) != 0) + panic("lfs: ifile read: %d", error); + while ((i + 1) % fs->lfs_sepb && i < newnsegs) { + sup = &((SEGUSE *)bp->b_data)[i % fs->lfs_sepb]; + memset(sup, 0, sizeof(*sup)); + i++; + } + VOP_BWRITE(bp->b_vp, bp); + } + } + + /* Zero out unused superblock offsets */ + for (i = 2; i < LFS_MAXNUMSB; i++) + if (dtosn(fs, fs->lfs_sboffs[i]) >= newnsegs) + fs->lfs_sboffs[i] = 0x0; + + /* + * Correct superblock entries that depend on fs size. + * The computations of these are as follows: + * + * size = segtod(fs, nseg) + * dsize = segtod(fs, nseg - minfreeseg) - btofsb(#super * LFS_SBPAD) + * bfree = dsize - btofsb(fs, bsize * nseg / 2) - blocks_actually_used + * avail = segtod(fs, nclean) - btofsb(#clean_super * LFS_SBPAD) + * + (segtod(fs, 1) - (offset - curseg)) + * - segtod(fs, minfreeseg - (minfreeseg / 2)) + * + * XXX - we should probably adjust minfreeseg as well. 
+ */ + gain = (newnsegs - oldnsegs); + fs->lfs_nseg = newnsegs; + fs->lfs_segtabsz = nlast - fs->lfs_cleansz; + fs->lfs_size += gain * btofsb(fs, fs->lfs_ssize); + fs->lfs_dsize += gain * btofsb(fs, fs->lfs_ssize) - btofsb(fs, sbbytes); + fs->lfs_bfree += gain * btofsb(fs, fs->lfs_ssize) - btofsb(fs, sbbytes) + - gain * btofsb(fs, fs->lfs_bsize / 2); + if (gain > 0) { + fs->lfs_nclean += gain; + fs->lfs_avail += gain * btofsb(fs, fs->lfs_ssize); + } else { + fs->lfs_nclean -= cgain; + fs->lfs_avail -= cgain * btofsb(fs, fs->lfs_ssize) - + btofsb(fs, csbbytes); + } + + /* Resize segment flag cache */ + fs->lfs_suflags[0] = (u_int32_t *)realloc(fs->lfs_suflags[0], + fs->lfs_nseg * sizeof(u_int32_t), + M_SEGMENT, M_WAITOK); + fs->lfs_suflags[1] = (u_int32_t *)realloc(fs->lfs_suflags[1], + fs->lfs_nseg * sizeof(u_int32_t), + M_SEGMENT, M_WAITOK); + for (i = oldnsegs; i < newnsegs; i++) + fs->lfs_suflags[0][i] = fs->lfs_suflags[1][i] = 0x0; + + /* Truncate Ifile if necessary */ + if (noff < 0) + lfs_truncate(ivp, ivp->v_size + (noff << fs->lfs_bshift), 0, + NOCRED); + + /* Update cleaner info so the cleaner can die */ + bread(ivp, 0, fs->lfs_bsize, NOCRED, B_MODIFY, &bp); + ((CLEANERINFO *)bp->b_data)->clean = fs->lfs_nclean; + ((CLEANERINFO *)bp->b_data)->dirty = fs->lfs_nseg - fs->lfs_nclean; + VOP_BWRITE(bp->b_vp, bp); + + /* Let Ifile accesses proceed */ + VOP_UNLOCK(ivp); + rw_exit(&fs->lfs_iflock); + + out: + lfs_segunlock(fs); + return error; +} diff --git a/sys/ufs/lfs/lfs_vnops.c b/sys/ufs/lfs/lfs_vnops.c new file mode 100644 index 000000000..f30a5d20c --- /dev/null +++ b/sys/ufs/lfs/lfs_vnops.c @@ -0,0 +1,2478 @@ +/* $NetBSD: lfs_vnops.c,v 1.238 2011/09/20 14:01:33 chs Exp $ */ + +/*- + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Konrad E. Schroder . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/* + * Copyright (c) 1986, 1989, 1991, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_vnops.c 8.13 (Berkeley) 6/10/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: lfs_vnops.c,v 1.238 2011/09/20 14:01:33 chs Exp $"); + +#ifdef _KERNEL_OPT +#include "opt_compat_netbsd.h" +#include "opt_uvm_page_trkown.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +extern pid_t lfs_writer_daemon; +int lfs_ignore_lazy_sync = 1; + +/* Global vfs data structures for lfs. 
*/ +int (**lfs_vnodeop_p)(void *); +const struct vnodeopv_entry_desc lfs_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, ufs_lookup }, /* lookup */ + { &vop_create_desc, lfs_create }, /* create */ + { &vop_whiteout_desc, ufs_whiteout }, /* whiteout */ + { &vop_mknod_desc, lfs_mknod }, /* mknod */ + { &vop_open_desc, ufs_open }, /* open */ + { &vop_close_desc, lfs_close }, /* close */ + { &vop_access_desc, ufs_access }, /* access */ + { &vop_getattr_desc, lfs_getattr }, /* getattr */ + { &vop_setattr_desc, lfs_setattr }, /* setattr */ + { &vop_read_desc, lfs_read }, /* read */ + { &vop_write_desc, lfs_write }, /* write */ + { &vop_ioctl_desc, ufs_ioctl }, /* ioctl */ + { &vop_fcntl_desc, lfs_fcntl }, /* fcntl */ + { &vop_poll_desc, ufs_poll }, /* poll */ + { &vop_kqfilter_desc, genfs_kqfilter }, /* kqfilter */ + { &vop_revoke_desc, ufs_revoke }, /* revoke */ + { &vop_mmap_desc, lfs_mmap }, /* mmap */ + { &vop_fsync_desc, lfs_fsync }, /* fsync */ + { &vop_seek_desc, ufs_seek }, /* seek */ + { &vop_remove_desc, lfs_remove }, /* remove */ + { &vop_link_desc, lfs_link }, /* link */ + { &vop_rename_desc, lfs_rename }, /* rename */ + { &vop_mkdir_desc, lfs_mkdir }, /* mkdir */ + { &vop_rmdir_desc, lfs_rmdir }, /* rmdir */ + { &vop_symlink_desc, lfs_symlink }, /* symlink */ + { &vop_readdir_desc, ufs_readdir }, /* readdir */ + { &vop_readlink_desc, ufs_readlink }, /* readlink */ + { &vop_abortop_desc, ufs_abortop }, /* abortop */ + { &vop_inactive_desc, lfs_inactive }, /* inactive */ + { &vop_reclaim_desc, lfs_reclaim }, /* reclaim */ + { &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, ufs_bmap }, /* bmap */ + { &vop_strategy_desc, lfs_strategy }, /* strategy */ + { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, ufs_pathconf }, /* pathconf */ + { &vop_advlock_desc, ufs_advlock }, /* advlock */ + { &vop_bwrite_desc, lfs_bwrite }, /* bwrite */ + { &vop_getpages_desc, lfs_getpages }, /* getpages */ + { &vop_putpages_desc, lfs_putpages }, /* putpages */ + { NULL, NULL } +}; +const struct vnodeopv_desc lfs_vnodeop_opv_desc = + { &lfs_vnodeop_p, lfs_vnodeop_entries }; + +int (**lfs_specop_p)(void *); +const struct vnodeopv_entry_desc lfs_specop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, spec_lookup }, /* lookup */ + { &vop_create_desc, spec_create }, /* create */ + { &vop_mknod_desc, spec_mknod }, /* mknod */ + { &vop_open_desc, spec_open }, /* open */ + { &vop_close_desc, lfsspec_close }, /* close */ + { &vop_access_desc, ufs_access }, /* access */ + { &vop_getattr_desc, lfs_getattr }, /* getattr */ + { &vop_setattr_desc, lfs_setattr }, /* setattr */ + { &vop_read_desc, ufsspec_read }, /* read */ + { &vop_write_desc, ufsspec_write }, /* write */ + { &vop_ioctl_desc, spec_ioctl }, /* ioctl */ + { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */ + { &vop_poll_desc, spec_poll }, /* poll */ + { &vop_kqfilter_desc, spec_kqfilter }, /* kqfilter */ + { &vop_revoke_desc, spec_revoke }, /* revoke */ + { &vop_mmap_desc, spec_mmap }, /* mmap */ + { &vop_fsync_desc, spec_fsync }, /* fsync */ + { &vop_seek_desc, spec_seek }, /* seek */ + { &vop_remove_desc, spec_remove }, /* remove */ + { &vop_link_desc, spec_link }, /* link */ + { &vop_rename_desc, spec_rename }, /* rename */ + { &vop_mkdir_desc, spec_mkdir }, /* mkdir */ + { &vop_rmdir_desc, spec_rmdir }, /* rmdir */ + { &vop_symlink_desc, spec_symlink }, /* 
symlink */ + { &vop_readdir_desc, spec_readdir }, /* readdir */ + { &vop_readlink_desc, spec_readlink }, /* readlink */ + { &vop_abortop_desc, spec_abortop }, /* abortop */ + { &vop_inactive_desc, lfs_inactive }, /* inactive */ + { &vop_reclaim_desc, lfs_reclaim }, /* reclaim */ + { &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, spec_bmap }, /* bmap */ + { &vop_strategy_desc, spec_strategy }, /* strategy */ + { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, spec_pathconf }, /* pathconf */ + { &vop_advlock_desc, spec_advlock }, /* advlock */ + { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ + { &vop_getpages_desc, spec_getpages }, /* getpages */ + { &vop_putpages_desc, spec_putpages }, /* putpages */ + { NULL, NULL } +}; +const struct vnodeopv_desc lfs_specop_opv_desc = + { &lfs_specop_p, lfs_specop_entries }; + +int (**lfs_fifoop_p)(void *); +const struct vnodeopv_entry_desc lfs_fifoop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, vn_fifo_bypass }, /* lookup */ + { &vop_create_desc, vn_fifo_bypass }, /* create */ + { &vop_mknod_desc, vn_fifo_bypass }, /* mknod */ + { &vop_open_desc, vn_fifo_bypass }, /* open */ + { &vop_close_desc, lfsfifo_close }, /* close */ + { &vop_access_desc, ufs_access }, /* access */ + { &vop_getattr_desc, lfs_getattr }, /* getattr */ + { &vop_setattr_desc, lfs_setattr }, /* setattr */ + { &vop_read_desc, ufsfifo_read }, /* read */ + { &vop_write_desc, ufsfifo_write }, /* write */ + { &vop_ioctl_desc, vn_fifo_bypass }, /* ioctl */ + { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */ + { &vop_poll_desc, vn_fifo_bypass }, /* poll */ + { &vop_kqfilter_desc, vn_fifo_bypass }, /* kqfilter */ + { &vop_revoke_desc, vn_fifo_bypass }, /* revoke */ + { &vop_mmap_desc, vn_fifo_bypass }, /* mmap */ + { &vop_fsync_desc, vn_fifo_bypass }, /* fsync */ + { &vop_seek_desc, vn_fifo_bypass }, /* seek */ + { &vop_remove_desc, vn_fifo_bypass }, /* remove */ + { &vop_link_desc, vn_fifo_bypass }, /* link */ + { &vop_rename_desc, vn_fifo_bypass }, /* rename */ + { &vop_mkdir_desc, vn_fifo_bypass }, /* mkdir */ + { &vop_rmdir_desc, vn_fifo_bypass }, /* rmdir */ + { &vop_symlink_desc, vn_fifo_bypass }, /* symlink */ + { &vop_readdir_desc, vn_fifo_bypass }, /* readdir */ + { &vop_readlink_desc, vn_fifo_bypass }, /* readlink */ + { &vop_abortop_desc, vn_fifo_bypass }, /* abortop */ + { &vop_inactive_desc, lfs_inactive }, /* inactive */ + { &vop_reclaim_desc, lfs_reclaim }, /* reclaim */ + { &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, vn_fifo_bypass }, /* bmap */ + { &vop_strategy_desc, vn_fifo_bypass }, /* strategy */ + { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, vn_fifo_bypass }, /* pathconf */ + { &vop_advlock_desc, vn_fifo_bypass }, /* advlock */ + { &vop_bwrite_desc, lfs_bwrite }, /* bwrite */ + { &vop_putpages_desc, vn_fifo_bypass }, /* putpages */ + { NULL, NULL } +}; +const struct vnodeopv_desc lfs_fifoop_opv_desc = + { &lfs_fifoop_p, lfs_fifoop_entries }; + +static int check_dirty(struct lfs *, struct vnode *, off_t, off_t, off_t, int, int, struct vm_page **); + +#define LFS_READWRITE +#include +#undef LFS_READWRITE + +/* + * Synch an open file. 
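+ * + * Dirty pages are pushed through VOP_PUTPAGES(); unless FSYNC_DATAONLY is + * set, the inode itself is then written with lfs_update(). FSYNC_LAZY + * requests are queued for the LFS writer daemon instead of being written + * synchronously here.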
+ */ +/* ARGSUSED */ +int +lfs_fsync(void *v) +{ + struct vop_fsync_args /* { + struct vnode *a_vp; + kauth_cred_t a_cred; + int a_flags; + off_t offlo; + off_t offhi; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + int error, wait; + struct inode *ip = VTOI(vp); + struct lfs *fs = ip->i_lfs; + + /* If we're mounted read-only, don't try to sync. */ + if (fs->lfs_ronly) + return 0; + + /* If a removed vnode is being cleaned, no need to sync here. */ + if ((ap->a_flags & FSYNC_RECLAIM) != 0 && ip->i_mode == 0) + return 0; + + /* + * Trickle sync simply adds this vnode to the pager list, as if + * the pagedaemon had requested a pageout. + */ + if (ap->a_flags & FSYNC_LAZY) { + if (lfs_ignore_lazy_sync == 0) { + mutex_enter(&lfs_lock); + if (!(ip->i_flags & IN_PAGING)) { + ip->i_flags |= IN_PAGING; + TAILQ_INSERT_TAIL(&fs->lfs_pchainhd, ip, + i_lfs_pchain); + } + wakeup(&lfs_writer_daemon); + mutex_exit(&lfs_lock); + } + return 0; + } + + /* + * If a vnode is being cleaned, flush it out before we try to + * reuse it. This prevents the cleaner from writing files twice + * in the same partial segment, causing an accounting underflow. + */ + if (ap->a_flags & FSYNC_RECLAIM && ip->i_flags & IN_CLEANING) { + lfs_vflush(vp); + } + + wait = (ap->a_flags & FSYNC_WAIT); + do { + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, trunc_page(ap->a_offlo), + round_page(ap->a_offhi), + PGO_CLEANIT | (wait ? PGO_SYNCIO : 0)); + if (error == EAGAIN) { + mutex_enter(&lfs_lock); + mtsleep(&fs->lfs_avail, PCATCH | PUSER, "lfs_fsync", + hz / 100 + 1, &lfs_lock); + mutex_exit(&lfs_lock); + } + } while (error == EAGAIN); + if (error) + return error; + + if ((ap->a_flags & FSYNC_DATAONLY) == 0) + error = lfs_update(vp, NULL, NULL, wait ? UPDATE_WAIT : 0); + + if (error == 0 && ap->a_flags & FSYNC_CACHE) { + int l = 0; + error = VOP_IOCTL(ip->i_devvp, DIOCCACHESYNC, &l, FWRITE, + curlwp->l_cred); + } + if (wait && !VPISEMPTY(vp)) + LFS_SET_UINO(ip, IN_MODIFIED); + + return error; +} + +/* + * Take IN_ADIROP off, then call ufs_inactive. + */ +int +lfs_inactive(void *v) +{ + struct vop_inactive_args /* { + struct vnode *a_vp; + } */ *ap = v; + + lfs_unmark_vnode(ap->a_vp); + + /* + * The Ifile is only ever inactivated on unmount. + * Streamline this process by not giving it more dirty blocks. + */ + if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM) { + mutex_enter(&lfs_lock); + LFS_CLR_UINO(VTOI(ap->a_vp), IN_ALLMOD); + mutex_exit(&lfs_lock); + VOP_UNLOCK(ap->a_vp); + return 0; + } + + return ufs_inactive(v); +} + +/* + * These macros are used to bracket UFS directory ops, so that we can + * identify all the pages touched during directory ops which need to + * be ordered and flushed atomically, so that they may be recovered. + * + * Because we have to mark nodes VU_DIROP in order to prevent + * the cache from reclaiming them while a dirop is in progress, we must + * also manage the number of nodes so marked (otherwise we can run out). + * We do this by setting lfs_dirvcount to the number of marked vnodes; it + * is decremented during segment write, when VU_DIROP is taken off.
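+ * + * lfs_set_dirop() below enforces that limit: when lfs_dirvcount exceeds + * LFS_MAX_DIROP it wakes the writer daemon and sleeps until enough marked + * vnodes have been written back and unmarked.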
+ */ +#define MARK_VNODE(vp) lfs_mark_vnode(vp) +#define UNMARK_VNODE(vp) lfs_unmark_vnode(vp) +#define SET_DIROP_CREATE(dvp, vpp) lfs_set_dirop_create((dvp), (vpp)) +#define SET_DIROP_REMOVE(dvp, vp) lfs_set_dirop((dvp), (vp)) +static int lfs_set_dirop_create(struct vnode *, struct vnode **); +static int lfs_set_dirop(struct vnode *, struct vnode *); + +static int +lfs_set_dirop(struct vnode *dvp, struct vnode *vp) +{ + struct lfs *fs; + int error; + + KASSERT(VOP_ISLOCKED(dvp)); + KASSERT(vp == NULL || VOP_ISLOCKED(vp)); + + fs = VTOI(dvp)->i_lfs; + + ASSERT_NO_SEGLOCK(fs); + /* + * LFS_NRESERVE calculates direct and indirect blocks as well + * as an inode block; an overestimate in most cases. + */ + if ((error = lfs_reserve(fs, dvp, vp, LFS_NRESERVE(fs))) != 0) + return (error); + + restart: + mutex_enter(&lfs_lock); + if (fs->lfs_dirops == 0) { + mutex_exit(&lfs_lock); + lfs_check(dvp, LFS_UNUSED_LBN, 0); + mutex_enter(&lfs_lock); + } + while (fs->lfs_writer) { + error = mtsleep(&fs->lfs_dirops, (PRIBIO + 1) | PCATCH, + "lfs_sdirop", 0, &lfs_lock); + if (error == EINTR) { + mutex_exit(&lfs_lock); + goto unreserve; + } + } + if (lfs_dirvcount > LFS_MAX_DIROP && fs->lfs_dirops == 0) { + wakeup(&lfs_writer_daemon); + mutex_exit(&lfs_lock); + preempt(); + goto restart; + } + + if (lfs_dirvcount > LFS_MAX_DIROP) { + mutex_exit(&lfs_lock); + DLOG((DLOG_DIROP, "lfs_set_dirop: sleeping with dirops=%d, " + "dirvcount=%d\n", fs->lfs_dirops, lfs_dirvcount)); + if ((error = mtsleep(&lfs_dirvcount, + PCATCH | PUSER | PNORELOCK, "lfs_maxdirop", 0, + &lfs_lock)) != 0) { + goto unreserve; + } + goto restart; + } + + ++fs->lfs_dirops; + fs->lfs_doifile = 1; + mutex_exit(&lfs_lock); + + /* Hold a reference so SET_ENDOP will be happy */ + vref(dvp); + if (vp) { + vref(vp); + MARK_VNODE(vp); + } + + MARK_VNODE(dvp); + return 0; + + unreserve: + lfs_reserve(fs, dvp, vp, -LFS_NRESERVE(fs)); + return error; +} + +/* + * Get a new vnode *before* adjusting the dirop count, to avoid a deadlock + * in getnewvnode(), if we have a stacked filesystem mounted on top + * of us. + * + * NB: this means we have to clear the new vnodes on error. Fortunately + * SET_ENDOP is there to do that for us. 
+ */ +static int +lfs_set_dirop_create(struct vnode *dvp, struct vnode **vpp) +{ + int error; + struct lfs *fs; + + fs = VFSTOUFS(dvp->v_mount)->um_lfs; + ASSERT_NO_SEGLOCK(fs); + if (fs->lfs_ronly) + return EROFS; + if (vpp == NULL) { + return lfs_set_dirop(dvp, NULL); + } + error = getnewvnode(VT_LFS, dvp->v_mount, lfs_vnodeop_p, NULL, vpp); + if (error) { + DLOG((DLOG_ALLOC, "lfs_set_dirop_create: dvp %p error %d\n", + dvp, error)); + return error; + } + if ((error = lfs_set_dirop(dvp, NULL)) != 0) { + ungetnewvnode(*vpp); + *vpp = NULL; + return error; + } + return 0; +} + +#define SET_ENDOP_BASE(fs, dvp, str) \ + do { \ + mutex_enter(&lfs_lock); \ + --(fs)->lfs_dirops; \ + if (!(fs)->lfs_dirops) { \ + if ((fs)->lfs_nadirop) { \ + panic("SET_ENDOP: %s: no dirops but " \ + " nadirop=%d", (str), \ + (fs)->lfs_nadirop); \ + } \ + wakeup(&(fs)->lfs_writer); \ + mutex_exit(&lfs_lock); \ + lfs_check((dvp), LFS_UNUSED_LBN, 0); \ + } else \ + mutex_exit(&lfs_lock); \ + } while(0) +#define SET_ENDOP_CREATE(fs, dvp, nvpp, str) \ + do { \ + UNMARK_VNODE(dvp); \ + if (nvpp && *nvpp) \ + UNMARK_VNODE(*nvpp); \ + /* Check for error return to stem vnode leakage */ \ + if (nvpp && *nvpp && !((*nvpp)->v_uflag & VU_DIROP)) \ + ungetnewvnode(*(nvpp)); \ + SET_ENDOP_BASE((fs), (dvp), (str)); \ + lfs_reserve((fs), (dvp), NULL, -LFS_NRESERVE(fs)); \ + vrele(dvp); \ + } while(0) +#define SET_ENDOP_CREATE_AP(ap, str) \ + SET_ENDOP_CREATE(VTOI((ap)->a_dvp)->i_lfs, (ap)->a_dvp, \ + (ap)->a_vpp, (str)) +#define SET_ENDOP_REMOVE(fs, dvp, ovp, str) \ + do { \ + UNMARK_VNODE(dvp); \ + if (ovp) \ + UNMARK_VNODE(ovp); \ + SET_ENDOP_BASE((fs), (dvp), (str)); \ + lfs_reserve((fs), (dvp), (ovp), -LFS_NRESERVE(fs)); \ + vrele(dvp); \ + if (ovp) \ + vrele(ovp); \ + } while(0) + +void +lfs_mark_vnode(struct vnode *vp) +{ + struct inode *ip = VTOI(vp); + struct lfs *fs = ip->i_lfs; + + mutex_enter(&lfs_lock); + if (!(ip->i_flag & IN_ADIROP)) { + if (!(vp->v_uflag & VU_DIROP)) { + mutex_enter(vp->v_interlock); + (void)lfs_vref(vp); + ++lfs_dirvcount; + ++fs->lfs_dirvcount; + TAILQ_INSERT_TAIL(&fs->lfs_dchainhd, ip, i_lfs_dchain); + vp->v_uflag |= VU_DIROP; + } + ++fs->lfs_nadirop; + ip->i_flag |= IN_ADIROP; + } else + KASSERT(vp->v_uflag & VU_DIROP); + mutex_exit(&lfs_lock); +} + +void +lfs_unmark_vnode(struct vnode *vp) +{ + struct inode *ip = VTOI(vp); + + if (ip && (ip->i_flag & IN_ADIROP)) { + KASSERT(vp->v_uflag & VU_DIROP); + mutex_enter(&lfs_lock); + --ip->i_lfs->lfs_nadirop; + mutex_exit(&lfs_lock); + ip->i_flag &= ~IN_ADIROP; + } +} + +int +lfs_symlink(void *v) +{ + struct vop_symlink_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + char *a_target; + } */ *ap = v; + int error; + + if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) { + vput(ap->a_dvp); + return error; + } + error = ufs_symlink(ap); + SET_ENDOP_CREATE_AP(ap, "symlink"); + return (error); +} + +int +lfs_mknod(void *v) +{ + struct vop_mknod_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap = v; + struct vattr *vap = ap->a_vap; + struct vnode **vpp = ap->a_vpp; + struct inode *ip; + int error; + struct mount *mp; + ino_t ino; + struct ufs_lookup_results *ulr; + + /* XXX should handle this material another way */ + ulr = &VTOI(ap->a_dvp)->i_crap; + UFS_CHECK_CRAPCOUNTER(VTOI(ap->a_dvp)); + + if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) { + vput(ap->a_dvp); + return error; + } + error = 
ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), + ap->a_dvp, ulr, vpp, ap->a_cnp); + + /* Either way we're done with the dirop at this point */ + SET_ENDOP_CREATE_AP(ap, "mknod"); + + if (error) + return (error); + + ip = VTOI(*vpp); + mp = (*vpp)->v_mount; + ino = ip->i_number; + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + if (vap->va_rdev != VNOVAL) { + /* + * Want to be able to use this to make badblock + * inodes, so don't truncate the dev number. + */ +#if 0 + ip->i_ffs1_rdev = ufs_rw32(vap->va_rdev, + UFS_MPNEEDSWAP((*vpp)->v_mount)); +#else + ip->i_ffs1_rdev = vap->va_rdev; +#endif + } + + /* + * Call fsync to write the vnode so that we don't have to deal with + * flushing it when it's marked VU_DIROP|VI_XLOCK. + * + * XXX KS - If we can't flush we also can't call vgone(), so must + * return. But, that leaves this vnode in limbo, also not good. + * Can this ever happen (barring hardware failure)? + */ + if ((error = VOP_FSYNC(*vpp, NOCRED, FSYNC_WAIT, 0, 0)) != 0) { + panic("lfs_mknod: couldn't fsync (ino %llu)", + (unsigned long long)ino); + /* return (error); */ + } + /* + * Remove vnode so that it will be reloaded by VFS_VGET and + * checked to see if it is an alias of an existing entry in + * the inode cache. + */ + /* Used to be vput, but that causes us to call VOP_INACTIVE twice. */ + + VOP_UNLOCK(*vpp); + (*vpp)->v_type = VNON; + vgone(*vpp); + error = VFS_VGET(mp, ino, vpp); + + if (error != 0) { + *vpp = NULL; + return (error); + } + return (0); +} + +int +lfs_create(void *v) +{ + struct vop_create_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap = v; + int error; + + if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) { + vput(ap->a_dvp); + return error; + } + error = ufs_create(ap); + SET_ENDOP_CREATE_AP(ap, "create"); + return (error); +} + +int +lfs_mkdir(void *v) +{ + struct vop_mkdir_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap = v; + int error; + + if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) { + vput(ap->a_dvp); + return error; + } + error = ufs_mkdir(ap); + SET_ENDOP_CREATE_AP(ap, "mkdir"); + return (error); +} + +int +lfs_remove(void *v) +{ + struct vop_remove_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap = v; + struct vnode *dvp, *vp; + struct inode *ip; + int error; + + dvp = ap->a_dvp; + vp = ap->a_vp; + ip = VTOI(vp); + if ((error = SET_DIROP_REMOVE(dvp, vp)) != 0) { + if (dvp == vp) + vrele(vp); + else + vput(vp); + vput(dvp); + return error; + } + error = ufs_remove(ap); + if (ip->i_nlink == 0) + lfs_orphan(ip->i_lfs, ip->i_number); + SET_ENDOP_REMOVE(ip->i_lfs, dvp, ap->a_vp, "remove"); + return (error); +} + +int +lfs_rmdir(void *v) +{ + struct vop_rmdir_args /* { + struct vnodeop_desc *a_desc; + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap = v; + struct vnode *vp; + struct inode *ip; + int error; + + vp = ap->a_vp; + ip = VTOI(vp); + if ((error = SET_DIROP_REMOVE(ap->a_dvp, ap->a_vp)) != 0) { + if (ap->a_dvp == vp) + vrele(ap->a_dvp); + else + vput(ap->a_dvp); + vput(vp); + return error; + } + error = ufs_rmdir(ap); + if (ip->i_nlink == 0) + lfs_orphan(ip->i_lfs, ip->i_number); + SET_ENDOP_REMOVE(ip->i_lfs, ap->a_dvp, ap->a_vp, "rmdir"); + return (error); +} + +int +lfs_link(void *v) +{ + struct vop_link_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname 
*a_cnp; + } */ *ap = v; + int error; + struct vnode **vpp = NULL; + + if ((error = SET_DIROP_CREATE(ap->a_dvp, vpp)) != 0) { + vput(ap->a_dvp); + return error; + } + error = ufs_link(ap); + SET_ENDOP_CREATE(VTOI(ap->a_dvp)->i_lfs, ap->a_dvp, vpp, "link"); + return (error); +} + +int +lfs_rename(void *v) +{ + struct vop_rename_args /* { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; + } */ *ap = v; + struct vnode *tvp, *fvp, *tdvp, *fdvp; + struct componentname *tcnp, *fcnp; + int error; + struct lfs *fs; + + fs = VTOI(ap->a_fdvp)->i_lfs; + tvp = ap->a_tvp; + tdvp = ap->a_tdvp; + tcnp = ap->a_tcnp; + fvp = ap->a_fvp; + fdvp = ap->a_fdvp; + fcnp = ap->a_fcnp; + + /* + * Check for cross-device rename. + * If it is, we don't want to set dirops, just error out. + * (In particular note that MARK_VNODE(tdvp) will DTWT on + * a cross-device rename.) + * + * Copied from ufs_rename. + */ + if ((fvp->v_mount != tdvp->v_mount) || + (tvp && (fvp->v_mount != tvp->v_mount))) { + error = EXDEV; + goto errout; + } + + /* + * Check to make sure we're not renaming a vnode onto itself + * (deleting a hard link by renaming one name onto another); + * if we are we can't recursively call VOP_REMOVE since that + * would leave us with an unaccounted-for number of live dirops. + * + * Inline the relevant section of ufs_rename here, *before* + * calling SET_DIROP_REMOVE. + */ + if (tvp && ((VTOI(tvp)->i_flags & (IMMUTABLE | APPEND)) || + (VTOI(tdvp)->i_flags & APPEND))) { + error = EPERM; + goto errout; + } + if (fvp == tvp) { + if (fvp->v_type == VDIR) { + error = EINVAL; + goto errout; + } + + /* Release destination completely. */ + VOP_ABORTOP(tdvp, tcnp); + vput(tdvp); + vput(tvp); + + /* Delete source. */ + vrele(fvp); + fcnp->cn_flags &= ~(MODMASK); + fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; + fcnp->cn_nameiop = DELETE; + vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY); + if ((error = relookup(fdvp, &fvp, fcnp, 0))) { + vput(fdvp); + return (error); + } + return (VOP_REMOVE(fdvp, fvp, fcnp)); + } + + if ((error = SET_DIROP_REMOVE(tdvp, tvp)) != 0) + goto errout; + MARK_VNODE(fdvp); + MARK_VNODE(fvp); + + error = ufs_rename(ap); + UNMARK_VNODE(fdvp); + UNMARK_VNODE(fvp); + SET_ENDOP_REMOVE(fs, tdvp, tvp, "rename"); + return (error); + + errout: + VOP_ABORTOP(tdvp, ap->a_tcnp); /* XXX, why not in NFS? */ + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp) + vput(tvp); + VOP_ABORTOP(fdvp, ap->a_fcnp); /* XXX, why not in NFS? 
*/ + vrele(fdvp); + vrele(fvp); + return (error); +} + +/* XXX hack to avoid calling ITIMES in getattr */ +int +lfs_getattr(void *v) +{ + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct vattr *vap = ap->a_vap; + struct lfs *fs = ip->i_lfs; + /* + * Copy from inode table + */ + vap->va_fsid = ip->i_dev; + vap->va_fileid = ip->i_number; + vap->va_mode = ip->i_mode & ~IFMT; + vap->va_nlink = ip->i_nlink; + vap->va_uid = ip->i_uid; + vap->va_gid = ip->i_gid; + vap->va_rdev = (dev_t)ip->i_ffs1_rdev; + vap->va_size = vp->v_size; + vap->va_atime.tv_sec = ip->i_ffs1_atime; + vap->va_atime.tv_nsec = ip->i_ffs1_atimensec; + vap->va_mtime.tv_sec = ip->i_ffs1_mtime; + vap->va_mtime.tv_nsec = ip->i_ffs1_mtimensec; + vap->va_ctime.tv_sec = ip->i_ffs1_ctime; + vap->va_ctime.tv_nsec = ip->i_ffs1_ctimensec; + vap->va_flags = ip->i_flags; + vap->va_gen = ip->i_gen; + /* this doesn't belong here */ + if (vp->v_type == VBLK) + vap->va_blocksize = BLKDEV_IOSIZE; + else if (vp->v_type == VCHR) + vap->va_blocksize = MAXBSIZE; + else + vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; + vap->va_bytes = fsbtob(fs, (u_quad_t)ip->i_lfs_effnblks); + vap->va_type = vp->v_type; + vap->va_filerev = ip->i_modrev; + return (0); +} + +/* + * Check to make sure the inode blocks won't choke the buffer + * cache, then call ufs_setattr as usual. + */ +int +lfs_setattr(void *v) +{ + struct vop_setattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + + lfs_check(vp, LFS_UNUSED_LBN, 0); + return ufs_setattr(v); +} + +/* + * Release the block we hold on lfs_newseg wrapping. Called on file close, + * or explicitly from LFCNWRAPGO. Called with the interlock held. + */ +static int +lfs_wrapgo(struct lfs *fs, struct inode *ip, int waitfor) +{ + if (fs->lfs_stoplwp != curlwp) + return EBUSY; + + fs->lfs_stoplwp = NULL; + cv_signal(&fs->lfs_stopcv); + + KASSERT(fs->lfs_nowrap > 0); + if (fs->lfs_nowrap <= 0) { + return 0; + } + + if (--fs->lfs_nowrap == 0) { + log(LOG_NOTICE, "%s: re-enabled log wrap\n", fs->lfs_fsmnt); + wakeup(&fs->lfs_wrappass); + lfs_wakeup_cleaner(fs); + } + if (waitfor) { + mtsleep(&fs->lfs_nextseg, PCATCH | PUSER, "segment", + 0, &lfs_lock); + } + + return 0; +} + +/* + * Close called + */ +/* ARGSUSED */ +int +lfs_close(void *v) +{ + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct lfs *fs = ip->i_lfs; + + if ((ip->i_number == ROOTINO || ip->i_number == LFS_IFILE_INUM) && + fs->lfs_stoplwp == curlwp) { + mutex_enter(&lfs_lock); + log(LOG_NOTICE, "lfs_close: releasing log wrap control\n"); + lfs_wrapgo(fs, ip, 0); + mutex_exit(&lfs_lock); + } + + if (vp == ip->i_lfs->lfs_ivnode && + vp->v_mount->mnt_iflag & IMNT_UNMOUNT) + return 0; + + if (vp->v_usecount > 1 && vp != ip->i_lfs->lfs_ivnode) { + LFS_ITIMES(ip, NULL, NULL, NULL); + } + return (0); +} + +/* + * Close wrapper for special devices. + * + * Update the times on the inode then do device close. 
+ */ +int +lfsspec_close(void *v) +{ + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp; + struct inode *ip; + + vp = ap->a_vp; + ip = VTOI(vp); + if (vp->v_usecount > 1) { + LFS_ITIMES(ip, NULL, NULL, NULL); + } + return (VOCALL (spec_vnodeop_p, VOFFSET(vop_close), ap)); +} + +/* + * Close wrapper for fifo's. + * + * Update the times on the inode then do device close. + */ +int +lfsfifo_close(void *v) +{ + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp; + struct inode *ip; + + vp = ap->a_vp; + ip = VTOI(vp); + if (ap->a_vp->v_usecount > 1) { + LFS_ITIMES(ip, NULL, NULL, NULL); + } + return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_close), ap)); +} + +/* + * Reclaim an inode so that it can be used for other purposes. + */ + +int +lfs_reclaim(void *v) +{ + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct lfs *fs = ip->i_lfs; + int error; + + /* + * The inode must be freed and updated before being removed + * from its hash chain. Other threads trying to gain a hold + * on the inode will be stalled because it is locked (VI_XLOCK). + */ + if (ip->i_nlink <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) + lfs_vfree(vp, ip->i_number, ip->i_omode); + + mutex_enter(&lfs_lock); + LFS_CLR_UINO(ip, IN_ALLMOD); + mutex_exit(&lfs_lock); + if ((error = ufs_reclaim(vp))) + return (error); + + /* + * Take us off the paging and/or dirop queues if we were on them. + * We shouldn't be on them. + */ + mutex_enter(&lfs_lock); + if (ip->i_flags & IN_PAGING) { + log(LOG_WARNING, "%s: reclaimed vnode is IN_PAGING\n", + fs->lfs_fsmnt); + ip->i_flags &= ~IN_PAGING; + TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain); + } + if (vp->v_uflag & VU_DIROP) { + panic("reclaimed vnode is VU_DIROP"); + vp->v_uflag &= ~VU_DIROP; + TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain); + } + mutex_exit(&lfs_lock); + + pool_put(&lfs_dinode_pool, ip->i_din.ffs1_din); + lfs_deregister_all(vp); + pool_put(&lfs_inoext_pool, ip->inode_ext.lfs); + ip->inode_ext.lfs = NULL; + genfs_node_destroy(vp); + pool_put(&lfs_inode_pool, vp->v_data); + vp->v_data = NULL; + return (0); +} + +/* + * Read a block from a storage device. + * In order to avoid reading blocks that are in the process of being + * written by the cleaner---and hence are not mutexed by the normal + * buffer cache / page cache mechanisms---check for collisions before + * reading. + * + * We inline ufs_strategy to make sure that the VOP_BMAP occurs *before* + * the active cleaner test. + * + * XXX This code assumes that lfs_markv makes synchronous checkpoints.
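+ * + * If the requested block lies in a segment interval the cleaner has + * claimed (fs->lfs_cleanint[]), the read sleeps until the cleaner + * releases the seglock or its outstanding I/O completes.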
+ */ +int +lfs_strategy(void *v) +{ + struct vop_strategy_args /* { + struct vnode *a_vp; + struct buf *a_bp; + } */ *ap = v; + struct buf *bp; + struct lfs *fs; + struct vnode *vp; + struct inode *ip; + daddr_t tbn; + int i, sn, error, slept; + + bp = ap->a_bp; + vp = ap->a_vp; + ip = VTOI(vp); + fs = ip->i_lfs; + + /* lfs uses its strategy routine only for read */ + KASSERT(bp->b_flags & B_READ); + + if (vp->v_type == VBLK || vp->v_type == VCHR) + panic("lfs_strategy: spec"); + KASSERT(bp->b_bcount != 0); + if (bp->b_blkno == bp->b_lblkno) { + error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, + NULL); + if (error) { + bp->b_error = error; + bp->b_resid = bp->b_bcount; + biodone(bp); + return (error); + } + if ((long)bp->b_blkno == -1) /* no valid data */ + clrbuf(bp); + } + if ((long)bp->b_blkno < 0) { /* block is not on disk */ + bp->b_resid = bp->b_bcount; + biodone(bp); + return (0); + } + + slept = 1; + mutex_enter(&lfs_lock); + while (slept && fs->lfs_seglock) { + mutex_exit(&lfs_lock); + /* + * Look through list of intervals. + * There will only be intervals to look through + * if the cleaner holds the seglock. + * Since the cleaner is synchronous, we can trust + * the list of intervals to be current. + */ + tbn = dbtofsb(fs, bp->b_blkno); + sn = dtosn(fs, tbn); + slept = 0; + for (i = 0; i < fs->lfs_cleanind; i++) { + if (sn == dtosn(fs, fs->lfs_cleanint[i]) && + tbn >= fs->lfs_cleanint[i]) { + DLOG((DLOG_CLEAN, + "lfs_strategy: ino %d lbn %" PRId64 + " ind %d sn %d fsb %" PRIx32 + " given sn %d fsb %" PRIx64 "\n", + ip->i_number, bp->b_lblkno, i, + dtosn(fs, fs->lfs_cleanint[i]), + fs->lfs_cleanint[i], sn, tbn)); + DLOG((DLOG_CLEAN, + "lfs_strategy: sleeping on ino %d lbn %" + PRId64 "\n", ip->i_number, bp->b_lblkno)); + mutex_enter(&lfs_lock); + if (LFS_SEGLOCK_HELD(fs) && fs->lfs_iocount) { + /* Cleaner can't wait for itself */ + mtsleep(&fs->lfs_iocount, + (PRIBIO + 1) | PNORELOCK, + "clean2", 0, + &lfs_lock); + slept = 1; + break; + } else if (fs->lfs_seglock) { + mtsleep(&fs->lfs_seglock, + (PRIBIO + 1) | PNORELOCK, + "clean1", 0, + &lfs_lock); + slept = 1; + break; + } + mutex_exit(&lfs_lock); + } + } + mutex_enter(&lfs_lock); + } + mutex_exit(&lfs_lock); + + vp = ip->i_devvp; + VOP_STRATEGY(vp, bp); + return (0); +} + +void +lfs_flush_dirops(struct lfs *fs) +{ + struct inode *ip, *nip; + struct vnode *vp; + extern int lfs_dostats; + struct segment *sp; + + ASSERT_MAYBE_SEGLOCK(fs); + KASSERT(fs->lfs_nadirop == 0); + + if (fs->lfs_ronly) + return; + + mutex_enter(&lfs_lock); + if (TAILQ_FIRST(&fs->lfs_dchainhd) == NULL) { + mutex_exit(&lfs_lock); + return; + } else + mutex_exit(&lfs_lock); + + if (lfs_dostats) + ++lfs_stats.flush_invoked; + + /* + * Inline lfs_segwrite/lfs_writevnodes, but just for dirops. + * Technically this is a checkpoint (the on-disk state is valid) + * even though we are leaving out all the file data. + */ + lfs_imtime(fs); + lfs_seglock(fs, SEGM_CKP); + sp = fs->lfs_sp; + + /* + * lfs_writevnodes, optimized to get dirops out of the way. + * Only write dirops, and don't flush files' pages, only + * blocks from the directories. + * + * We don't need to vref these files because they are + * dirops and so hold an extra reference until the + * segunlock clears them of that status. + * + * We don't need to check for IN_ADIROP because we know that + * no dirops are active. 
+ * + */ + mutex_enter(&lfs_lock); + for (ip = TAILQ_FIRST(&fs->lfs_dchainhd); ip != NULL; ip = nip) { + nip = TAILQ_NEXT(ip, i_lfs_dchain); + mutex_exit(&lfs_lock); + vp = ITOV(ip); + + KASSERT((ip->i_flag & IN_ADIROP) == 0); + + /* + * All writes to directories come from dirops; all + * writes to files' direct blocks go through the page + * cache, which we're not touching. Reads to files + * and/or directories will not be affected by writing + * directory blocks inodes and file inodes. So we don't + * really need to lock. If we don't lock, though, + * make sure that we don't clear IN_MODIFIED + * unnecessarily. + */ + if (vp->v_iflag & VI_XLOCK) { + mutex_enter(&lfs_lock); + continue; + } + /* XXX see below + * waslocked = VOP_ISLOCKED(vp); + */ + if (vp->v_type != VREG && + ((ip->i_flag & IN_ALLMOD) || !VPISEMPTY(vp))) { + lfs_writefile(fs, sp, vp); + if (!VPISEMPTY(vp) && !WRITEINPROG(vp) && + !(ip->i_flag & IN_ALLMOD)) { + mutex_enter(&lfs_lock); + LFS_SET_UINO(ip, IN_MODIFIED); + mutex_exit(&lfs_lock); + } + } + KDASSERT(ip->i_number != LFS_IFILE_INUM); + (void) lfs_writeinode(fs, sp, ip); + mutex_enter(&lfs_lock); + /* + * XXX + * LK_EXCLOTHER is dead -- what is intended here? + * if (waslocked == LK_EXCLOTHER) + * LFS_SET_UINO(ip, IN_MODIFIED); + */ + } + mutex_exit(&lfs_lock); + /* We've written all the dirops there are */ + ((SEGSUM *)(sp->segsum))->ss_flags &= ~(SS_CONT); + lfs_finalize_fs_seguse(fs); + (void) lfs_writeseg(fs, sp); + lfs_segunlock(fs); +} + +/* + * Flush all vnodes for which the pagedaemon has requested pageouts. + * Skip over any files that are marked VU_DIROP (since lfs_flush_dirop() + * has just run, this would be an error). If we have to skip a vnode + * for any reason, just skip it; if we have to wait for the cleaner, + * abort. The writer daemon will call us again later. + */ +void +lfs_flush_pchain(struct lfs *fs) +{ + struct inode *ip, *nip; + struct vnode *vp; + extern int lfs_dostats; + struct segment *sp; + int error; + + ASSERT_NO_SEGLOCK(fs); + + if (fs->lfs_ronly) + return; + + mutex_enter(&lfs_lock); + if (TAILQ_FIRST(&fs->lfs_pchainhd) == NULL) { + mutex_exit(&lfs_lock); + return; + } else + mutex_exit(&lfs_lock); + + /* Get dirops out of the way */ + lfs_flush_dirops(fs); + + if (lfs_dostats) + ++lfs_stats.flush_invoked; + + /* + * Inline lfs_segwrite/lfs_writevnodes, but just for pageouts. + */ + lfs_imtime(fs); + lfs_seglock(fs, 0); + sp = fs->lfs_sp; + + /* + * lfs_writevnodes, optimized to clear pageout requests. + * Only write non-dirop files that are in the pageout queue. + * We're very conservative about what we write; we want to be + * fast and async. 
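+ * + * Vnodes that are locked, marked VU_DIROP, or not regular files are + * simply skipped; if lfs_writefile() returns EAGAIN, the partial segment + * is written out and the loop is abandoned so the writer daemon can try + * again later.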
+ */ + mutex_enter(&lfs_lock); + top: + for (ip = TAILQ_FIRST(&fs->lfs_pchainhd); ip != NULL; ip = nip) { + nip = TAILQ_NEXT(ip, i_lfs_pchain); + vp = ITOV(ip); + + if (!(ip->i_flags & IN_PAGING)) + goto top; + + mutex_enter(vp->v_interlock); + if ((vp->v_iflag & VI_XLOCK) || (vp->v_uflag & VU_DIROP) != 0) { + mutex_exit(vp->v_interlock); + continue; + } + if (vp->v_type != VREG) { + mutex_exit(vp->v_interlock); + continue; + } + if (lfs_vref(vp)) + continue; + mutex_exit(&lfs_lock); + + if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_RETRY) != 0) { + lfs_vunref(vp); + mutex_enter(&lfs_lock); + continue; + } + + error = lfs_writefile(fs, sp, vp); + if (!VPISEMPTY(vp) && !WRITEINPROG(vp) && + !(ip->i_flag & IN_ALLMOD)) { + mutex_enter(&lfs_lock); + LFS_SET_UINO(ip, IN_MODIFIED); + mutex_exit(&lfs_lock); + } + KDASSERT(ip->i_number != LFS_IFILE_INUM); + (void) lfs_writeinode(fs, sp, ip); + + VOP_UNLOCK(vp); + lfs_vunref(vp); + + if (error == EAGAIN) { + lfs_writeseg(fs, sp); + mutex_enter(&lfs_lock); + break; + } + mutex_enter(&lfs_lock); + } + mutex_exit(&lfs_lock); + (void) lfs_writeseg(fs, sp); + lfs_segunlock(fs); +} + +/* + * Provide a fcntl interface to sys_lfs_{segwait,bmapv,markv}. + */ +int +lfs_fcntl(void *v) +{ + struct vop_fcntl_args /* { + struct vnode *a_vp; + u_int a_command; + void * a_data; + int a_fflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct timeval tv; + struct timeval *tvp; + BLOCK_INFO *blkiov; + CLEANERINFO *cip; + SEGUSE *sup; + int blkcnt, error, oclean; + size_t fh_size; + struct lfs_fcntl_markv blkvp; + struct lwp *l; + fsid_t *fsidp; + struct lfs *fs; + struct buf *bp; + fhandle_t *fhp; + daddr_t off; + + /* Only respect LFS fcntls on fs root or Ifile */ + if (VTOI(ap->a_vp)->i_number != ROOTINO && + VTOI(ap->a_vp)->i_number != LFS_IFILE_INUM) { + return ufs_fcntl(v); + } + + /* Avoid locking a draining lock */ + if (ap->a_vp->v_mount->mnt_iflag & IMNT_UNMOUNT) { + return ESHUTDOWN; + } + + /* LFS control and monitoring fcntls are available only to root */ + l = curlwp; + if (((ap->a_command & 0xff00) >> 8) == 'L' && + (error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, + NULL)) != 0) + return (error); + + fs = VTOI(ap->a_vp)->i_lfs; + fsidp = &ap->a_vp->v_mount->mnt_stat.f_fsidx; + + error = 0; + switch ((int)ap->a_command) { + case LFCNSEGWAITALL_COMPAT_50: + case LFCNSEGWAITALL_COMPAT: + fsidp = NULL; + /* FALLSTHROUGH */ + case LFCNSEGWAIT_COMPAT_50: + case LFCNSEGWAIT_COMPAT: + { + struct timeval50 *tvp50 + = (struct timeval50 *)ap->a_data; + timeval50_to_timeval(tvp50, &tv); + tvp = &tv; + } + goto segwait_common; + case LFCNSEGWAITALL: + fsidp = NULL; + /* FALLSTHROUGH */ + case LFCNSEGWAIT: + tvp = (struct timeval *)ap->a_data; +segwait_common: + mutex_enter(&lfs_lock); + ++fs->lfs_sleepers; + mutex_exit(&lfs_lock); + + error = lfs_segwait(fsidp, tvp); + + mutex_enter(&lfs_lock); + if (--fs->lfs_sleepers == 0) + wakeup(&fs->lfs_sleepers); + mutex_exit(&lfs_lock); + return error; + + case LFCNBMAPV: + case LFCNMARKV: + blkvp = *(struct lfs_fcntl_markv *)ap->a_data; + + blkcnt = blkvp.blkcnt; + if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT) + return (EINVAL); + blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV); + if ((error = copyin(blkvp.blkiov, blkiov, + blkcnt * sizeof(BLOCK_INFO))) != 0) { + lfs_free(fs, blkiov, LFS_NB_BLKIOV); + return error; + } + + mutex_enter(&lfs_lock); + ++fs->lfs_sleepers; + mutex_exit(&lfs_lock); + if (ap->a_command == LFCNBMAPV) + error = lfs_bmapv(l->l_proc, fsidp, blkiov, blkcnt); + else /* 
LFCNMARKV */ + error = lfs_markv(l->l_proc, fsidp, blkiov, blkcnt); + if (error == 0) + error = copyout(blkiov, blkvp.blkiov, + blkcnt * sizeof(BLOCK_INFO)); + mutex_enter(&lfs_lock); + if (--fs->lfs_sleepers == 0) + wakeup(&fs->lfs_sleepers); + mutex_exit(&lfs_lock); + lfs_free(fs, blkiov, LFS_NB_BLKIOV); + return error; + + case LFCNRECLAIM: + /* + * Flush dirops and write Ifile, allowing empty segments + * to be immediately reclaimed. + */ + lfs_writer_enter(fs, "pndirop"); + off = fs->lfs_offset; + lfs_seglock(fs, SEGM_FORCE_CKP | SEGM_CKP); + lfs_flush_dirops(fs); + LFS_CLEANERINFO(cip, fs, bp); + oclean = cip->clean; + LFS_SYNC_CLEANERINFO(cip, fs, bp, 1); + lfs_segwrite(ap->a_vp->v_mount, SEGM_FORCE_CKP); + fs->lfs_sp->seg_flags |= SEGM_PROT; + lfs_segunlock(fs); + lfs_writer_leave(fs); + +#ifdef DEBUG + LFS_CLEANERINFO(cip, fs, bp); + DLOG((DLOG_CLEAN, "lfs_fcntl: reclaim wrote %" PRId64 + " blocks, cleaned %" PRId32 " segments (activesb %d)\n", + fs->lfs_offset - off, cip->clean - oclean, + fs->lfs_activesb)); + LFS_SYNC_CLEANERINFO(cip, fs, bp, 0); +#endif + + return 0; + + case LFCNIFILEFH_COMPAT: + /* Return the filehandle of the Ifile */ + if ((error = kauth_authorize_system(l->l_cred, + KAUTH_SYSTEM_FILEHANDLE, 0, NULL, NULL, NULL)) != 0) + return (error); + fhp = (struct fhandle *)ap->a_data; + fhp->fh_fsid = *fsidp; + fh_size = 16; /* former VFS_MAXFIDSIZ */ + return lfs_vptofh(fs->lfs_ivnode, &(fhp->fh_fid), &fh_size); + + case LFCNIFILEFH_COMPAT2: + case LFCNIFILEFH: + /* Return the filehandle of the Ifile */ + fhp = (struct fhandle *)ap->a_data; + fhp->fh_fsid = *fsidp; + fh_size = sizeof(struct lfs_fhandle) - + offsetof(fhandle_t, fh_fid); + return lfs_vptofh(fs->lfs_ivnode, &(fhp->fh_fid), &fh_size); + + case LFCNREWIND: + /* Move lfs_offset to the lowest-numbered segment */ + return lfs_rewind(fs, *(int *)ap->a_data); + + case LFCNINVAL: + /* Mark a segment SEGUSE_INVAL */ + LFS_SEGENTRY(sup, fs, *(int *)ap->a_data, bp); + if (sup->su_nbytes > 0) { + brelse(bp, 0); + lfs_unset_inval_all(fs); + return EBUSY; + } + sup->su_flags |= SEGUSE_INVAL; + VOP_BWRITE(bp->b_vp, bp); + return 0; + + case LFCNRESIZE: + /* Resize the filesystem */ + return lfs_resize_fs(fs, *(int *)ap->a_data); + + case LFCNWRAPSTOP: + case LFCNWRAPSTOP_COMPAT: + /* + * Hold lfs_newseg at segment 0; if requested, sleep until + * the filesystem wraps around. To support external agents + * (dump, fsck-based regression test) that need to look at + * a snapshot of the filesystem, without necessarily + * requiring that all fs activity stops. + */ + if (fs->lfs_stoplwp == curlwp) + return EALREADY; + + mutex_enter(&lfs_lock); + while (fs->lfs_stoplwp != NULL) + cv_wait(&fs->lfs_stopcv, &lfs_lock); + fs->lfs_stoplwp = curlwp; + if (fs->lfs_nowrap == 0) + log(LOG_NOTICE, "%s: disabled log wrap\n", fs->lfs_fsmnt); + ++fs->lfs_nowrap; + if (*(int *)ap->a_data == 1 + || ap->a_command == LFCNWRAPSTOP_COMPAT) { + log(LOG_NOTICE, "LFCNSTOPWRAP waiting for log wrap\n"); + error = mtsleep(&fs->lfs_nowrap, PCATCH | PUSER, + "segwrap", 0, &lfs_lock); + log(LOG_NOTICE, "LFCNSTOPWRAP done waiting\n"); + if (error) { + lfs_wrapgo(fs, VTOI(ap->a_vp), 0); + } + } + mutex_exit(&lfs_lock); + return 0; + + case LFCNWRAPGO: + case LFCNWRAPGO_COMPAT: + /* + * Having done its work, the agent wakes up the writer. + * If the argument is 1, it sleeps until a new segment + * is selected. + */ + mutex_enter(&lfs_lock); + error = lfs_wrapgo(fs, VTOI(ap->a_vp), + ap->a_command == LFCNWRAPGO_COMPAT ? 
1 : + *((int *)ap->a_data)); + mutex_exit(&lfs_lock); + return error; + + case LFCNWRAPPASS: + if ((VTOI(ap->a_vp)->i_lfs_iflags & LFSI_WRAPWAIT)) + return EALREADY; + mutex_enter(&lfs_lock); + if (fs->lfs_stoplwp != curlwp) { + mutex_exit(&lfs_lock); + return EALREADY; + } + if (fs->lfs_nowrap == 0) { + mutex_exit(&lfs_lock); + return EBUSY; + } + fs->lfs_wrappass = 1; + wakeup(&fs->lfs_wrappass); + /* Wait for the log to wrap, if asked */ + if (*(int *)ap->a_data) { + mutex_enter(ap->a_vp->v_interlock); + lfs_vref(ap->a_vp); + VTOI(ap->a_vp)->i_lfs_iflags |= LFSI_WRAPWAIT; + log(LOG_NOTICE, "LFCNPASS waiting for log wrap\n"); + error = mtsleep(&fs->lfs_nowrap, PCATCH | PUSER, + "segwrap", 0, &lfs_lock); + log(LOG_NOTICE, "LFCNPASS done waiting\n"); + VTOI(ap->a_vp)->i_lfs_iflags &= ~LFSI_WRAPWAIT; + lfs_vunref(ap->a_vp); + } + mutex_exit(&lfs_lock); + return error; + + case LFCNWRAPSTATUS: + mutex_enter(&lfs_lock); + *(int *)ap->a_data = fs->lfs_wrapstatus; + mutex_exit(&lfs_lock); + return 0; + + default: + return ufs_fcntl(v); + } + return 0; +} + +int +lfs_getpages(void *v) +{ + struct vop_getpages_args /* { + struct vnode *a_vp; + voff_t a_offset; + struct vm_page **a_m; + int *a_count; + int a_centeridx; + vm_prot_t a_access_type; + int a_advice; + int a_flags; + } */ *ap = v; + + if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM && + (ap->a_access_type & VM_PROT_WRITE) != 0) { + return EPERM; + } + if ((ap->a_access_type & VM_PROT_WRITE) != 0) { + mutex_enter(&lfs_lock); + LFS_SET_UINO(VTOI(ap->a_vp), IN_MODIFIED); + mutex_exit(&lfs_lock); + } + + /* + * we're relying on the fact that genfs_getpages() always read in + * entire filesystem blocks. + */ + return genfs_getpages(v); +} + +/* + * Wait for a page to become unbusy, possibly printing diagnostic messages + * as well. + * + * Called with vp->v_interlock held; return with it held. + */ +static void +wait_for_page(struct vnode *vp, struct vm_page *pg, const char *label) +{ + if ((pg->flags & PG_BUSY) == 0) + return; /* Nothing to wait for! */ + +#if defined(DEBUG) && defined(UVM_PAGE_TRKOWN) + static struct vm_page *lastpg; + + if (label != NULL && pg != lastpg) { + if (pg->owner_tag) { + printf("lfs_putpages[%d.%d]: %s: page %p owner %d.%d [%s]\n", + curproc->p_pid, curlwp->l_lid, label, + pg, pg->owner, pg->lowner, pg->owner_tag); + } else { + printf("lfs_putpages[%d.%d]: %s: page %p unowned?!\n", + curproc->p_pid, curlwp->l_lid, label, pg); + } + } + lastpg = pg; +#endif + + pg->flags |= PG_WANTED; + UVM_UNLOCK_AND_WAIT(pg, vp->v_interlock, 0, "lfsput", 0); + mutex_enter(vp->v_interlock); +} + +/* + * This routine is called by lfs_putpages() when it can't complete the + * write because a page is busy. This means that either (1) someone, + * possibly the pagedaemon, is looking at this page, and will give it up + * presently; or (2) we ourselves are holding the page busy in the + * process of being written (either gathered or actually on its way to + * disk). We don't need to give up the segment lock, but we might need + * to call lfs_writeseg() to expedite the page's journey to disk. + * + * Called with vp->v_interlock held; return with it held. 
+ */ +/* #define BUSYWAIT */ +static void +write_and_wait(struct lfs *fs, struct vnode *vp, struct vm_page *pg, + int seglocked, const char *label) +{ +#ifndef BUSYWAIT + struct inode *ip = VTOI(vp); + struct segment *sp = fs->lfs_sp; + int count = 0; + + if (pg == NULL) + return; + + while (pg->flags & PG_BUSY && + pg->uobject == &vp->v_uobj) { + mutex_exit(vp->v_interlock); + if (sp->cbpp - sp->bpp > 1) { + /* Write gathered pages */ + lfs_updatemeta(sp); + lfs_release_finfo(fs); + (void) lfs_writeseg(fs, sp); + + /* + * Reinitialize FIP + */ + KASSERT(sp->vp == vp); + lfs_acquire_finfo(fs, ip->i_number, + ip->i_gen); + } + ++count; + mutex_enter(vp->v_interlock); + wait_for_page(vp, pg, label); + } + if (label != NULL && count > 1) + printf("lfs_putpages[%d]: %s: %sn = %d\n", curproc->p_pid, + label, (count > 0 ? "looping, " : ""), count); +#else + preempt(1); +#endif +} + +/* + * Make sure that for all pages in every block in the given range, + * either all are dirty or all are clean. If any of the pages + * we've seen so far are dirty, put the vnode on the paging chain, + * and mark it IN_PAGING. + * + * If checkfirst != 0, don't check all the pages but return at the + * first dirty page. + */ +static int +check_dirty(struct lfs *fs, struct vnode *vp, + off_t startoffset, off_t endoffset, off_t blkeof, + int flags, int checkfirst, struct vm_page **pgp) +{ + int by_list; + struct vm_page *curpg = NULL; /* XXX: gcc */ + struct vm_page *pgs[MAXBSIZE / PAGE_SIZE], *pg; + off_t soff = 0; /* XXX: gcc */ + voff_t off; + int i; + int nonexistent; + int any_dirty; /* number of dirty pages */ + int dirty; /* number of dirty pages in a block */ + int tdirty; + int pages_per_block = fs->lfs_bsize >> PAGE_SHIFT; + int pagedaemon = (curlwp == uvm.pagedaemon_lwp); + + ASSERT_MAYBE_SEGLOCK(fs); + top: + by_list = (vp->v_uobj.uo_npages <= + ((endoffset - startoffset) >> PAGE_SHIFT) * + UVM_PAGE_TREE_PENALTY); + any_dirty = 0; + + if (by_list) { + curpg = TAILQ_FIRST(&vp->v_uobj.memq); + } else { + soff = startoffset; + } + while (by_list || soff < MIN(blkeof, endoffset)) { + if (by_list) { + /* + * Find the first page in a block. Skip + * blocks outside our area of interest or beyond + * the end of file. + */ + KASSERT(curpg == NULL + || (curpg->flags & PG_MARKER) == 0); + if (pages_per_block > 1) { + while (curpg && + ((curpg->offset & fs->lfs_bmask) || + curpg->offset >= vp->v_size || + curpg->offset >= endoffset)) { + curpg = TAILQ_NEXT(curpg, listq.queue); + KASSERT(curpg == NULL || + (curpg->flags & PG_MARKER) == 0); + } + } + if (curpg == NULL) + break; + soff = curpg->offset; + } + + /* + * Mark all pages in extended range busy; find out if any + * of them are dirty. + */ + nonexistent = dirty = 0; + for (i = 0; i == 0 || i < pages_per_block; i++) { + if (by_list && pages_per_block <= 1) { + pgs[i] = pg = curpg; + } else { + off = soff + (i << PAGE_SHIFT); + pgs[i] = pg = uvm_pagelookup(&vp->v_uobj, off); + if (pg == NULL) { + ++nonexistent; + continue; + } + } + KASSERT(pg != NULL); + + /* + * If we're holding the segment lock, we can deadlock + * against a process that has our page and is waiting + * for the cleaner, while the cleaner waits for the + * segment lock. Just bail in that case. 
+ */ + if ((pg->flags & PG_BUSY) && + (pagedaemon || LFS_SEGLOCK_HELD(fs))) { + if (i > 0) + uvm_page_unbusy(pgs, i); + DLOG((DLOG_PAGE, "lfs_putpages: avoiding 3-way or pagedaemon deadlock\n")); + if (pgp) + *pgp = pg; + return -1; + } + + while (pg->flags & PG_BUSY) { + wait_for_page(vp, pg, NULL); + if (i > 0) + uvm_page_unbusy(pgs, i); + goto top; + } + pg->flags |= PG_BUSY; + UVM_PAGE_OWN(pg, "lfs_putpages"); + + pmap_page_protect(pg, VM_PROT_NONE); + tdirty = (pmap_clear_modify(pg) || + (pg->flags & PG_CLEAN) == 0); + dirty += tdirty; + } + if (pages_per_block > 0 && nonexistent >= pages_per_block) { + if (by_list) { + curpg = TAILQ_NEXT(curpg, listq.queue); + } else { + soff += fs->lfs_bsize; + } + continue; + } + + any_dirty += dirty; + KASSERT(nonexistent == 0); + + /* + * If any are dirty make all dirty; unbusy them, + * but if we were asked to clean, wire them so that + * the pagedaemon doesn't bother us about them while + * they're on their way to disk. + */ + for (i = 0; i == 0 || i < pages_per_block; i++) { + pg = pgs[i]; + KASSERT(!((pg->flags & PG_CLEAN) && (pg->flags & PG_DELWRI))); + if (dirty) { + pg->flags &= ~PG_CLEAN; + if (flags & PGO_FREE) { + /* + * Wire the page so that + * pdaemon doesn't see it again. + */ + mutex_enter(&uvm_pageqlock); + uvm_pagewire(pg); + mutex_exit(&uvm_pageqlock); + + /* Suspended write flag */ + pg->flags |= PG_DELWRI; + } + } + if (pg->flags & PG_WANTED) + wakeup(pg); + pg->flags &= ~(PG_WANTED|PG_BUSY); + UVM_PAGE_OWN(pg, NULL); + } + + if (checkfirst && any_dirty) + break; + + if (by_list) { + curpg = TAILQ_NEXT(curpg, listq.queue); + } else { + soff += MAX(PAGE_SIZE, fs->lfs_bsize); + } + } + + return any_dirty; +} + +/* + * lfs_putpages functions like genfs_putpages except that + * + * (1) It needs to bounds-check the incoming requests to ensure that + * they are block-aligned; if they are not, expand the range and + * do the right thing in case, e.g., the requested range is clean + * but the expanded range is dirty. + * + * (2) It needs to explicitly send blocks to be written when it is done. + * If VOP_PUTPAGES is called without the seglock held, we simply take + * the seglock and let lfs_segunlock wait for us. + * XXX There might be a bad situation if we have to flush a vnode while + * XXX lfs_markv is in operation. As of this writing we panic in this + * XXX case. + * + * Assumptions: + * + * (1) The caller does not hold any pages in this vnode busy. If it does, + * there is a danger that when we expand the page range and busy the + * pages we will deadlock. + * + * (2) We are called with vp->v_interlock held; we must return with it + * released. + * + * (3) We don't absolutely have to free pages right away, provided that + * the request does not have PGO_SYNCIO. When the pagedaemon gives + * us a request with PGO_FREE, we take the pages out of the paging + * queue and wake up the writer, which will handle freeing them for us. + * + * We ensure that for any filesystem block, all pages for that + * block are either resident or not, even if those pages are higher + * than EOF; that means that we will be getting requests to free + * "unused" pages above EOF all the time, and should ignore them. + * + * (4) If we are called with PGO_LOCKED, the finfo array we are to write + * into has been set up for us by lfs_writefile. If not, we will + * have to handle allocating and/or freeing an finfo entry. + * + * XXX note that we're (ab)using PGO_LOCKED as "seglock held". 
+ */ + +/* How many times to loop before we should start to worry */ +#define TOOMANY 4 + +int +lfs_putpages(void *v) +{ + int error; + struct vop_putpages_args /* { + struct vnode *a_vp; + voff_t a_offlo; + voff_t a_offhi; + int a_flags; + } */ *ap = v; + struct vnode *vp; + struct inode *ip; + struct lfs *fs; + struct segment *sp; + off_t origoffset, startoffset, endoffset, origendoffset, blkeof; + off_t off, max_endoffset; + bool seglocked, sync, pagedaemon; + struct vm_page *pg, *busypg; + UVMHIST_FUNC("lfs_putpages"); UVMHIST_CALLED(ubchist); +#ifdef DEBUG + int debug_n_again, debug_n_dirtyclean; +#endif + + vp = ap->a_vp; + ip = VTOI(vp); + fs = ip->i_lfs; + sync = (ap->a_flags & PGO_SYNCIO) != 0; + pagedaemon = (curlwp == uvm.pagedaemon_lwp); + + /* Putpages does nothing for metadata. */ + if (vp == fs->lfs_ivnode || vp->v_type != VREG) { + mutex_exit(vp->v_interlock); + return 0; + } + + /* + * If there are no pages, don't do anything. + */ + if (vp->v_uobj.uo_npages == 0) { + if (TAILQ_EMPTY(&vp->v_uobj.memq) && + (vp->v_iflag & VI_ONWORKLST) && + LIST_FIRST(&vp->v_dirtyblkhd) == NULL) { + vp->v_iflag &= ~VI_WRMAPDIRTY; + vn_syncer_remove_from_worklist(vp); + } + mutex_exit(vp->v_interlock); + + /* Remove us from paging queue, if we were on it */ + mutex_enter(&lfs_lock); + if (ip->i_flags & IN_PAGING) { + ip->i_flags &= ~IN_PAGING; + TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain); + } + mutex_exit(&lfs_lock); + return 0; + } + + blkeof = blkroundup(fs, ip->i_size); + + /* + * Ignore requests to free pages past EOF but in the same block + * as EOF, unless the request is synchronous. (If the request is + * sync, it comes from lfs_truncate.) + * XXXUBC Make these pages look "active" so the pagedaemon won't + * XXXUBC bother us with them again. + */ + if (!sync && ap->a_offlo >= ip->i_size && ap->a_offlo < blkeof) { + origoffset = ap->a_offlo; + for (off = origoffset; off < blkeof; off += fs->lfs_bsize) { + pg = uvm_pagelookup(&vp->v_uobj, off); + KASSERT(pg != NULL); + while (pg->flags & PG_BUSY) { + pg->flags |= PG_WANTED; + UVM_UNLOCK_AND_WAIT(pg, vp->v_interlock, 0, + "lfsput2", 0); + mutex_enter(vp->v_interlock); + } + mutex_enter(&uvm_pageqlock); + uvm_pageactivate(pg); + mutex_exit(&uvm_pageqlock); + } + ap->a_offlo = blkeof; + if (ap->a_offhi > 0 && ap->a_offhi <= ap->a_offlo) { + mutex_exit(vp->v_interlock); + return 0; + } + } + + /* + * Extend page range to start and end at block boundaries. + * (For the purposes of VOP_PUTPAGES, fragments don't exist.) + */ + origoffset = ap->a_offlo; + origendoffset = ap->a_offhi; + startoffset = origoffset & ~(fs->lfs_bmask); + max_endoffset = (trunc_page(LLONG_MAX) >> fs->lfs_bshift) + << fs->lfs_bshift; + + if (origendoffset == 0 || ap->a_flags & PGO_ALLPAGES) { + endoffset = max_endoffset; + origendoffset = endoffset; + } else { + origendoffset = round_page(ap->a_offhi); + endoffset = round_page(blkroundup(fs, origendoffset)); + } + + KASSERT(startoffset > 0 || endoffset >= startoffset); + if (startoffset == endoffset) { + /* Nothing to do, why were we called? */ + mutex_exit(vp->v_interlock); + DLOG((DLOG_PAGE, "lfs_putpages: startoffset = endoffset = %" + PRId64 "\n", startoffset)); + return 0; + } + + ap->a_offlo = startoffset; + ap->a_offhi = endoffset; + + /* + * If not cleaning, just send the pages through genfs_putpages + * to be returned to the pool. 
+ */ + if (!(ap->a_flags & PGO_CLEANIT)) + return genfs_putpages(v); + + /* Set PGO_BUSYFAIL to avoid deadlocks */ + ap->a_flags |= PGO_BUSYFAIL; + + /* + * Likewise, if we are asked to clean but the pages are not + * dirty, we can just free them using genfs_putpages. + */ +#ifdef DEBUG + debug_n_dirtyclean = 0; +#endif + do { + int r; + + /* Count the number of dirty pages */ + r = check_dirty(fs, vp, startoffset, endoffset, blkeof, + ap->a_flags, 1, NULL); + if (r < 0) { + /* Pages are busy with another process */ + mutex_exit(vp->v_interlock); + return EDEADLK; + } + if (r > 0) /* Some pages are dirty */ + break; + + /* + * Sometimes pages are dirtied between the time that + * we check and the time we try to clean them. + * Instruct lfs_gop_write to return EDEADLK in this case + * so we can write them properly. + */ + ip->i_lfs_iflags |= LFSI_NO_GOP_WRITE; + r = genfs_do_putpages(vp, startoffset, endoffset, + ap->a_flags & ~PGO_SYNCIO, &busypg); + ip->i_lfs_iflags &= ~LFSI_NO_GOP_WRITE; + if (r != EDEADLK) + return r; + + /* One of the pages was busy. Start over. */ + mutex_enter(vp->v_interlock); + wait_for_page(vp, busypg, "dirtyclean"); +#ifdef DEBUG + ++debug_n_dirtyclean; +#endif + } while(1); + +#ifdef DEBUG + if (debug_n_dirtyclean > TOOMANY) + printf("lfs_putpages: dirtyclean: looping, n = %d\n", + debug_n_dirtyclean); +#endif + + /* + * Dirty and asked to clean. + * + * Pagedaemon can't actually write LFS pages; wake up + * the writer to take care of that. The writer will + * notice the pager inode queue and act on that. + * + * XXX We must drop the vp->interlock before taking the lfs_lock or we + * get a nasty deadlock with lfs_flush_pchain(). + */ + if (pagedaemon) { + mutex_exit(vp->v_interlock); + mutex_enter(&lfs_lock); + if (!(ip->i_flags & IN_PAGING)) { + ip->i_flags |= IN_PAGING; + TAILQ_INSERT_TAIL(&fs->lfs_pchainhd, ip, i_lfs_pchain); + } + wakeup(&lfs_writer_daemon); + mutex_exit(&lfs_lock); + preempt(); + return EWOULDBLOCK; + } + + /* + * If this is a file created in a recent dirop, we can't flush its + * inode until the dirop is complete. Drain dirops, then flush the + * filesystem (taking care of any other pending dirops while we're + * at it). + */ + if ((ap->a_flags & (PGO_CLEANIT|PGO_LOCKED)) == PGO_CLEANIT && + (vp->v_uflag & VU_DIROP)) { + int locked; + + DLOG((DLOG_PAGE, "lfs_putpages: flushing VU_DIROP\n")); + /* XXX VOP_ISLOCKED() may not be used for lock decisions. */ + locked = (VOP_ISLOCKED(vp) == LK_EXCLUSIVE); + mutex_exit(vp->v_interlock); + lfs_writer_enter(fs, "ppdirop"); + if (locked) + VOP_UNLOCK(vp); /* XXX why? */ + + mutex_enter(&lfs_lock); + lfs_flush_fs(fs, sync ? SEGM_SYNC : 0); + mutex_exit(&lfs_lock); + + if (locked) + VOP_LOCK(vp, LK_EXCLUSIVE); + mutex_enter(vp->v_interlock); + lfs_writer_leave(fs); + + /* XXX the flush should have taken care of this one too! */ + } + + /* + * This is it. We are going to write some pages. From here on + * down it's all just mechanics. + * + * Don't let genfs_putpages wait; lfs_segunlock will wait for us. + */ + ap->a_flags &= ~PGO_SYNCIO; + + /* + * If we've already got the seglock, flush the node and return. + * The FIP has already been set up for us by lfs_writefile, + * and FIP cleanup and lfs_updatemeta will also be done there, + * unless genfs_putpages returns EDEADLK; then we must flush + * what we have, and correct FIP and segment header accounting. + */ + get_seglock: + /* + * If we are not called with the segment locked, lock it. + * Account for a new FIP in the segment header, and set sp->vp. 
+ * (This should duplicate the setup at the top of lfs_writefile().) + */ + seglocked = (ap->a_flags & PGO_LOCKED) != 0; + if (!seglocked) { + mutex_exit(vp->v_interlock); + error = lfs_seglock(fs, SEGM_PROT | (sync ? SEGM_SYNC : 0)); + if (error != 0) + return error; + mutex_enter(vp->v_interlock); + lfs_acquire_finfo(fs, ip->i_number, ip->i_gen); + } + sp = fs->lfs_sp; + KASSERT(sp->vp == NULL); + sp->vp = vp; + + /* + * Ensure that the partial segment is marked SS_DIROP if this + * vnode is a DIROP. + */ + if (!seglocked && vp->v_uflag & VU_DIROP) + ((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT); + + /* + * Loop over genfs_putpages until all pages are gathered. + * genfs_putpages() drops the interlock, so reacquire it if necessary. + * Whenever we lose the interlock we have to rerun check_dirty, as + * well, since more pages might have been dirtied in our absence. + */ +#ifdef DEBUG + debug_n_again = 0; +#endif + do { + busypg = NULL; + if (check_dirty(fs, vp, startoffset, endoffset, blkeof, + ap->a_flags, 0, &busypg) < 0) { + mutex_exit(vp->v_interlock); + + mutex_enter(vp->v_interlock); + write_and_wait(fs, vp, busypg, seglocked, NULL); + if (!seglocked) { + mutex_exit(vp->v_interlock); + lfs_release_finfo(fs); + lfs_segunlock(fs); + mutex_enter(vp->v_interlock); + } + sp->vp = NULL; + goto get_seglock; + } + + busypg = NULL; + error = genfs_do_putpages(vp, startoffset, endoffset, + ap->a_flags, &busypg); + + if (error == EDEADLK || error == EAGAIN) { + DLOG((DLOG_PAGE, "lfs_putpages: genfs_putpages returned" + " %d ino %d off %x (seg %d)\n", error, + ip->i_number, fs->lfs_offset, + dtosn(fs, fs->lfs_offset))); + + mutex_enter(vp->v_interlock); + write_and_wait(fs, vp, busypg, seglocked, "again"); + } +#ifdef DEBUG + ++debug_n_again; +#endif + } while (error == EDEADLK); +#ifdef DEBUG + if (debug_n_again > TOOMANY) + printf("lfs_putpages: again: looping, n = %d\n", debug_n_again); +#endif + + KASSERT(sp != NULL && sp->vp == vp); + if (!seglocked) { + sp->vp = NULL; + + /* Write indirect blocks as well */ + lfs_gather(fs, fs->lfs_sp, vp, lfs_match_indir); + lfs_gather(fs, fs->lfs_sp, vp, lfs_match_dindir); + lfs_gather(fs, fs->lfs_sp, vp, lfs_match_tindir); + + KASSERT(sp->vp == NULL); + sp->vp = vp; + } + + /* + * Blocks are now gathered into a segment waiting to be written. + * All that's left to do is update metadata, and write them. + */ + lfs_updatemeta(sp); + KASSERT(sp->vp == vp); + sp->vp = NULL; + + /* + * If we were called from lfs_writefile, we don't need to clean up + * the FIP or unlock the segment lock. We're done. + */ + if (seglocked) + return error; + + /* Clean up FIP and send it to disk. */ + lfs_release_finfo(fs); + lfs_writeseg(fs, fs->lfs_sp); + + /* + * Remove us from paging queue if we wrote all our pages. + */ + if (origendoffset == 0 || ap->a_flags & PGO_ALLPAGES) { + mutex_enter(&lfs_lock); + if (ip->i_flags & IN_PAGING) { + ip->i_flags &= ~IN_PAGING; + TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain); + } + mutex_exit(&lfs_lock); + } + + /* + * XXX - with the malloc/copy writeseg, the pages are freed by now + * even if we don't wait (e.g. if we hold a nested lock). This + * will not be true if we stop using malloc/copy. + */ + KASSERT(fs->lfs_sp->seg_flags & SEGM_PROT); + lfs_segunlock(fs); + + /* + * Wait for v_numoutput to drop to zero. The seglock should + * take care of this, but there is a slight possibility that + * aiodoned might not have got around to our buffers yet. 
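The sync path that follows waits for vp->v_numoutput to drain to zero under the interlock, re-checking the counter after every wakeup. A minimal pthread sketch of that wait-loop idiom, with hypothetical names standing in for v_numoutput and v_cv:

#include <pthread.h>

static pthread_mutex_t io_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t io_cv = PTHREAD_COND_INITIALIZER;
static int num_output;			/* outstanding writes, cf. v_numoutput */

/* Called by the I/O completion path. */
static void
io_done(void)
{
	pthread_mutex_lock(&io_lock);
	if (--num_output == 0)
		pthread_cond_broadcast(&io_cv);
	pthread_mutex_unlock(&io_lock);
}

/* Called by the sync path: block until all writes have completed. */
static void
wait_for_output(void)
{
	pthread_mutex_lock(&io_lock);
	while (num_output > 0)		/* always re-check after a wakeup */
		pthread_cond_wait(&io_cv, &io_lock);
	pthread_mutex_unlock(&io_lock);
}

The while loop (rather than a single wait) matters because a wakeup only says the counter changed, not that it reached zero.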
+ */ + if (sync) { + mutex_enter(vp->v_interlock); + while (vp->v_numoutput > 0) { + DLOG((DLOG_PAGE, "lfs_putpages: ino %d sleeping on" + " num %d\n", ip->i_number, vp->v_numoutput)); + cv_wait(&vp->v_cv, vp->v_interlock); + } + mutex_exit(vp->v_interlock); + } + return error; +} + +/* + * Return the last logical file offset that should be written for this file + * if we're doing a write that ends at "size". If writing, we need to know + * about sizes on disk, i.e. fragments if there are any; if reading, we need + * to know about entire blocks. + */ +void +lfs_gop_size(struct vnode *vp, off_t size, off_t *eobp, int flags) +{ + struct inode *ip = VTOI(vp); + struct lfs *fs = ip->i_lfs; + daddr_t olbn, nlbn; + + olbn = lblkno(fs, ip->i_size); + nlbn = lblkno(fs, size); + if (!(flags & GOP_SIZE_MEM) && nlbn < NDADDR && olbn <= nlbn) { + *eobp = fragroundup(fs, size); + } else { + *eobp = blkroundup(fs, size); + } +} + +#ifdef DEBUG +void lfs_dump_vop(void *); + +void +lfs_dump_vop(void *v) +{ + struct vop_putpages_args /* { + struct vnode *a_vp; + voff_t a_offlo; + voff_t a_offhi; + int a_flags; + } */ *ap = v; + +#ifdef DDB + vfs_vnode_print(ap->a_vp, 0, printf); +#endif + lfs_dump_dinode(VTOI(ap->a_vp)->i_din.ffs1_din); +} +#endif + +int +lfs_mmap(void *v) +{ + struct vop_mmap_args /* { + const struct vnodeop_desc *a_desc; + struct vnode *a_vp; + vm_prot_t a_prot; + kauth_cred_t a_cred; + } */ *ap = v; + + if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM) + return EOPNOTSUPP; + return ufs_mmap(v); +} diff --git a/sys/ufs/mfs/Makefile b/sys/ufs/mfs/Makefile new file mode 100644 index 000000000..c0fdca997 --- /dev/null +++ b/sys/ufs/mfs/Makefile @@ -0,0 +1,7 @@ +# $NetBSD: Makefile,v 1.2 1999/07/03 18:40:32 thorpej Exp $ + +INCSDIR= /usr/include/ufs/mfs + +INCS= mfs_extern.h mfsnode.h + +.include diff --git a/include/ufs/mfs/mfs_extern.h b/sys/ufs/mfs/mfs_extern.h similarity index 100% rename from include/ufs/mfs/mfs_extern.h rename to sys/ufs/mfs/mfs_extern.h diff --git a/sys/ufs/mfs/mfs_miniroot.c b/sys/ufs/mfs/mfs_miniroot.c new file mode 100644 index 000000000..cfd4a03b6 --- /dev/null +++ b/sys/ufs/mfs/mfs_miniroot.c @@ -0,0 +1,68 @@ +/* $NetBSD: mfs_miniroot.c,v 1.1 2010/03/02 17:20:02 pooka Exp $ */ + +/* + * Copyright (c) 1989, 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)mfs_vfsops.c 8.11 (Berkeley) 6/19/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: mfs_miniroot.c,v 1.1 2010/03/02 17:20:02 pooka Exp $"); + +#include + +#include +#include + +void * mfs_rootbase; /* address of mini-root in kernel virtual memory */ +u_long mfs_rootsize; /* size of mini-root in bytes */ + +/* + * This is called early in boot to set the base address and size + * of the mini-root. + */ +int +mfs_initminiroot(void *base) +{ + struct fs *fs = (struct fs *)((char *)base + SBLOCK_UFS1); + static bool inited = false; + + if (inited) + panic("mfs_initminiroot() called more than once"); + inited = true; + + /* check for valid super block */ + if (fs->fs_magic != FS_UFS1_MAGIC || fs->fs_bsize > MAXBSIZE || + fs->fs_bsize < sizeof(struct fs)) + return (0); + rootfstype = MOUNT_MFS; + mfs_rootbase = base; + mfs_rootsize = fs->fs_fsize * fs->fs_size; + rootdev = makedev(255, 0); + return (mfs_rootsize); +} diff --git a/sys/ufs/mfs/mfs_vfsops.c b/sys/ufs/mfs/mfs_vfsops.c new file mode 100644 index 000000000..292998dc0 --- /dev/null +++ b/sys/ufs/mfs/mfs_vfsops.c @@ -0,0 +1,444 @@ +/* $NetBSD: mfs_vfsops.c,v 1.103 2011/06/12 03:36:01 rmind Exp $ */ + +/* + * Copyright (c) 1989, 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)mfs_vfsops.c 8.11 (Berkeley) 6/19/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: mfs_vfsops.c,v 1.103 2011/06/12 03:36:01 rmind Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_compat_netbsd.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +MODULE(MODULE_CLASS_VFS, mfs, "ffs"); + +kmutex_t mfs_lock; /* global lock */ + +/* used for building internal dev_t, minor == 0 reserved for miniroot */ +static int mfs_minor = 1; +static int mfs_initcnt; + +extern int (**mfs_vnodeop_p)(void *); + +static struct sysctllog *mfs_sysctl_log; + +/* + * mfs vfs operations. + */ + +extern const struct vnodeopv_desc mfs_vnodeop_opv_desc; + +const struct vnodeopv_desc * const mfs_vnodeopv_descs[] = { + &mfs_vnodeop_opv_desc, + NULL, +}; + +struct vfsops mfs_vfsops = { + MOUNT_MFS, + sizeof (struct mfs_args), + mfs_mount, + mfs_start, + ffs_unmount, + ufs_root, + ufs_quotactl, + mfs_statvfs, + ffs_sync, + ffs_vget, + ffs_fhtovp, + ffs_vptofh, + mfs_init, + mfs_reinit, + mfs_done, + NULL, + (int (*)(struct mount *, struct vnode *, struct timespec *)) eopnotsupp, + vfs_stdextattrctl, + (void *)eopnotsupp, /* vfs_suspendctl */ + genfs_renamelock_enter, + genfs_renamelock_exit, + (void *)eopnotsupp, + mfs_vnodeopv_descs, + 0, + { NULL, NULL }, +}; + +static int +mfs_modcmd(modcmd_t cmd, void *arg) +{ + int error; + + switch (cmd) { + case MODULE_CMD_INIT: + error = vfs_attach(&mfs_vfsops); + if (error != 0) + break; + sysctl_createv(&mfs_sysctl_log, 0, NULL, NULL, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "vfs", NULL, + NULL, 0, NULL, 0, + CTL_VFS, CTL_EOL); + sysctl_createv(&mfs_sysctl_log, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_ALIAS, + CTLTYPE_NODE, "mfs", + SYSCTL_DESCR("Memory based file system"), + NULL, 1, NULL, 0, + CTL_VFS, 3, CTL_EOL); + /* + * XXX the "1" and the "3" above could be dynamic, thereby + * eliminating one more instance of the "number to vfs" + * mapping problem, but they are in order as taken from + * sys/mount.h + */ + break; + case MODULE_CMD_FINI: + error = vfs_detach(&mfs_vfsops); + if (error != 0) + break; + sysctl_teardown(&mfs_sysctl_log); + break; + default: + error = ENOTTY; + break; + } + + return (error); +} + +/* + * Memory based filesystem initialization. + */ +void +mfs_init(void) +{ + + if (mfs_initcnt++ == 0) { + mutex_init(&mfs_lock, MUTEX_DEFAULT, IPL_NONE); + ffs_init(); + } +} + +void +mfs_reinit(void) +{ + + ffs_reinit(); +} + +void +mfs_done(void) +{ + + if (--mfs_initcnt == 0) { + ffs_done(); + mutex_destroy(&mfs_lock); + } +} + +/* + * Called by main() when mfs is going to be mounted as root. 
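mfs_init()/mfs_done() above initialize the shared ffs state and the global mutex only on the first reference and tear them down on the last, guarded by mfs_initcnt. A small sketch of that first-in/last-out counting pattern, with hypothetical names (and relying, as the kernel code does, on the caller serializing init/done):

#include <stdio.h>

static int initcnt;

static void subsystem_setup(void)    { puts("setup"); }
static void subsystem_teardown(void) { puts("teardown"); }

static void
module_init(void)
{
	if (initcnt++ == 0)		/* only the first user does the real work */
		subsystem_setup();
}

static void
module_done(void)
{
	if (--initcnt == 0)		/* only the last user tears it down */
		subsystem_teardown();
}

int
main(void)
{
	module_init();
	module_init();			/* second user: no extra setup */
	module_done();
	module_done();			/* last user: teardown happens here */
	return 0;
}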
+ */ + +int +mfs_mountroot(void) +{ + struct fs *fs; + struct mount *mp; + struct lwp *l = curlwp; /* XXX */ + struct ufsmount *ump; + struct mfsnode *mfsp; + int error = 0; + + if ((error = vfs_rootmountalloc(MOUNT_MFS, "mfs_root", &mp))) { + vrele(rootvp); + return (error); + } + + mfsp = kmem_alloc(sizeof(*mfsp), KM_SLEEP); + rootvp->v_data = mfsp; + rootvp->v_op = mfs_vnodeop_p; + rootvp->v_tag = VT_MFS; + mfsp->mfs_baseoff = mfs_rootbase; + mfsp->mfs_size = mfs_rootsize; + mfsp->mfs_vnode = rootvp; + mfsp->mfs_proc = NULL; /* indicate kernel space */ + mfsp->mfs_shutdown = 0; + cv_init(&mfsp->mfs_cv, "mfs"); + mfsp->mfs_refcnt = 1; + bufq_alloc(&mfsp->mfs_buflist, "fcfs", 0); + if ((error = ffs_mountfs(rootvp, mp, l)) != 0) { + vfs_unbusy(mp, false, NULL); + bufq_free(mfsp->mfs_buflist); + vfs_destroy(mp); + kmem_free(mfsp, sizeof(*mfsp)); + return (error); + } + mutex_enter(&mountlist_lock); + CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mutex_exit(&mountlist_lock); + mp->mnt_vnodecovered = NULLVP; + ump = VFSTOUFS(mp); + fs = ump->um_fs; + (void) copystr(mp->mnt_stat.f_mntonname, fs->fs_fsmnt, MNAMELEN - 1, 0); + (void)ffs_statvfs(mp, &mp->mnt_stat); + vfs_unbusy(mp, false, NULL); + return (0); +} + +/* + * VFS Operations. + * + * mount system call + */ +/* ARGSUSED */ +int +mfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) +{ + struct lwp *l = curlwp; + struct vnode *devvp; + struct mfs_args *args = data; + struct ufsmount *ump; + struct fs *fs; + struct mfsnode *mfsp; + struct proc *p; + int flags, error = 0; + + if (*data_len < sizeof *args) + return EINVAL; + + p = l->l_proc; + if (mp->mnt_flag & MNT_GETARGS) { + struct vnode *vp; + + ump = VFSTOUFS(mp); + if (ump == NULL) + return EIO; + + vp = ump->um_devvp; + if (vp == NULL) + return EIO; + + mfsp = VTOMFS(vp); + if (mfsp == NULL) + return EIO; + + args->fspec = NULL; + args->base = mfsp->mfs_baseoff; + args->size = mfsp->mfs_size; + *data_len = sizeof *args; + return 0; + } + /* + * XXX turn off async to avoid hangs when writing lots of data. + * the problem is that MFS needs to allocate pages to clean pages, + * so if we wait until the last minute to clean pages then there + * may not be any pages available to do the cleaning. + * ... and since the default partially-synchronous mode turns out + * to not be sufficient under heavy load, make it full synchronous. + */ + mp->mnt_flag &= ~MNT_ASYNC; + mp->mnt_flag |= MNT_SYNCHRONOUS; + + /* + * If updating, check whether changing from read-only to + * read/write; if there is no device name, that's all we do. 
+ */ + if (mp->mnt_flag & MNT_UPDATE) { + ump = VFSTOUFS(mp); + fs = ump->um_fs; + if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) { + flags = WRITECLOSE; + if (mp->mnt_flag & MNT_FORCE) + flags |= FORCECLOSE; + error = ffs_flushfiles(mp, flags, l); + if (error) + return (error); + } + if (fs->fs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR)) + fs->fs_ronly = 0; + if (args->fspec == NULL) + return EINVAL; + return (0); + } + error = getnewvnode(VT_MFS, NULL, mfs_vnodeop_p, NULL, &devvp); + if (error) + return (error); + devvp->v_vflag |= VV_MPSAFE; + devvp->v_type = VBLK; + spec_node_init(devvp, makedev(255, mfs_minor)); + mfs_minor++; + mfsp = kmem_alloc(sizeof(*mfsp), KM_SLEEP); + devvp->v_data = mfsp; + mfsp->mfs_baseoff = args->base; + mfsp->mfs_size = args->size; + mfsp->mfs_vnode = devvp; + mfsp->mfs_proc = p; + mfsp->mfs_shutdown = 0; + cv_init(&mfsp->mfs_cv, "mfsidl"); + mfsp->mfs_refcnt = 1; + bufq_alloc(&mfsp->mfs_buflist, "fcfs", 0); + if ((error = ffs_mountfs(devvp, mp, l)) != 0) { + mfsp->mfs_shutdown = 1; + vrele(devvp); + return (error); + } + ump = VFSTOUFS(mp); + fs = ump->um_fs; + error = set_statvfs_info(path, UIO_USERSPACE, args->fspec, + UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l); + if (error) + return error; + (void)strncpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, + sizeof(fs->fs_fsmnt)); + fs->fs_fsmnt[sizeof(fs->fs_fsmnt) - 1] = '\0'; + /* XXX: cleanup on error */ + return 0; +} + +/* + * Used to grab the process and keep it in the kernel to service + * memory filesystem I/O requests. + * + * Loop servicing I/O requests. + * Copy the requested data into or out of the memory filesystem + * address space. + */ +/* ARGSUSED */ +int +mfs_start(struct mount *mp, int flags) +{ + struct vnode *vp; + struct mfsnode *mfsp; + struct proc *p; + struct buf *bp; + void *base; + int sleepreturn = 0, refcnt, error; + ksiginfoq_t kq; + + /* + * Ensure that file system is still mounted when getting mfsnode. + * Add a reference to the mfsnode to prevent it disappearing in + * this routine. + */ + if ((error = vfs_busy(mp, NULL)) != 0) + return error; + vp = VFSTOUFS(mp)->um_devvp; + mfsp = VTOMFS(vp); + mutex_enter(&mfs_lock); + mfsp->mfs_refcnt++; + mutex_exit(&mfs_lock); + vfs_unbusy(mp, false, NULL); + + base = mfsp->mfs_baseoff; + mutex_enter(&mfs_lock); + while (mfsp->mfs_shutdown != 1) { + while ((bp = bufq_get(mfsp->mfs_buflist)) != NULL) { + mutex_exit(&mfs_lock); + mfs_doio(bp, base); + mutex_enter(&mfs_lock); + } + /* + * If a non-ignored signal is received, try to unmount. + * If that fails, or the filesystem is already in the + * process of being unmounted, clear the signal (it has been + * "processed"), otherwise we will loop here, as tsleep + * will always return EINTR/ERESTART. + */ + if (sleepreturn != 0) { + mutex_exit(&mfs_lock); + if (dounmount(mp, 0, curlwp) != 0) { + p = curproc; + ksiginfo_queue_init(&kq); + mutex_enter(p->p_lock); + sigclearall(p, NULL, &kq); + mutex_exit(p->p_lock); + ksiginfo_queue_drain(&kq); + } + sleepreturn = 0; + mutex_enter(&mfs_lock); + continue; + } + + sleepreturn = cv_wait_sig(&mfsp->mfs_cv, &mfs_lock); + } + KASSERT(bufq_peek(mfsp->mfs_buflist) == NULL); + refcnt = --mfsp->mfs_refcnt; + mutex_exit(&mfs_lock); + if (refcnt == 0) { + bufq_free(mfsp->mfs_buflist); + cv_destroy(&mfsp->mfs_cv); + kmem_free(mfsp, sizeof(*mfsp)); + } + return (sleepreturn); +} + +/* + * Get file system statistics. 
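Both mfs_mount() above (for fs_fsmnt) and mfs_statvfs() below (for f_fstypename) copy names with strncpy() and then force NUL termination, because strncpy() does not terminate the buffer when the source fills it. A minimal stand-alone sketch of that idiom:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char fstypename[8];
	const char *name = "mfs";

	(void)strncpy(fstypename, name, sizeof(fstypename));
	fstypename[sizeof(fstypename) - 1] = '\0';	/* always terminate */

	printf("%s\n", fstypename);
	return 0;
}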
+ */ +int +mfs_statvfs(struct mount *mp, struct statvfs *sbp) +{ + int error; + + error = ffs_statvfs(mp, sbp); + if (error) + return error; + (void)strncpy(sbp->f_fstypename, mp->mnt_op->vfs_name, + sizeof(sbp->f_fstypename)); + sbp->f_fstypename[sizeof(sbp->f_fstypename) - 1] = '\0'; + return 0; +} diff --git a/sys/ufs/mfs/mfs_vnops.c b/sys/ufs/mfs/mfs_vnops.c new file mode 100644 index 000000000..53a2c5874 --- /dev/null +++ b/sys/ufs/mfs/mfs_vnops.c @@ -0,0 +1,327 @@ +/* $NetBSD: mfs_vnops.c,v 1.54 2010/06/24 13:03:19 hannken Exp $ */ + +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)mfs_vnops.c 8.11 (Berkeley) 5/22/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: mfs_vnops.c,v 1.54 2010/06/24 13:03:19 hannken Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include + +/* + * mfs vnode operations. 
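The vnode operations vector that follows pairs each operation descriptor with a handler and falls back to vn_default_error for anything not listed. A user-space sketch of the same descriptor/handler dispatch idea, with hypothetical names and a string key in place of the vop_*_desc pointers:

#include <stdio.h>
#include <string.h>

typedef int (*op_fn)(void *);

struct op_entry {
	const char *name;	/* cf. vop_*_desc */
	op_fn fn;		/* handler, cf. mfs_* / spec_* / genfs_* */
};

static int default_error(void *v) { (void)v; return -1; }
static int my_open(void *v)       { (void)v; return 0; }
static int my_strategy(void *v)   { (void)v; return 0; }

static const struct op_entry ops[] = {
	{ "open",     my_open },
	{ "strategy", my_strategy },
	{ NULL,       NULL }
};

static int
dispatch(const char *name, void *arg)
{
	for (const struct op_entry *e = ops; e->name != NULL; e++)
		if (strcmp(e->name, name) == 0)
			return e->fn(arg);
	return default_error(arg);	/* cf. vop_default_desc */
}

int
main(void)
{
	printf("open -> %d, lookup -> %d\n",
	    dispatch("open", NULL), dispatch("lookup", NULL));
	return 0;
}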
+ */ +int (**mfs_vnodeop_p)(void *); +const struct vnodeopv_entry_desc mfs_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, mfs_lookup }, /* lookup */ + { &vop_create_desc, mfs_create }, /* create */ + { &vop_mknod_desc, mfs_mknod }, /* mknod */ + { &vop_open_desc, mfs_open }, /* open */ + { &vop_close_desc, mfs_close }, /* close */ + { &vop_access_desc, mfs_access }, /* access */ + { &vop_getattr_desc, mfs_getattr }, /* getattr */ + { &vop_setattr_desc, mfs_setattr }, /* setattr */ + { &vop_read_desc, mfs_read }, /* read */ + { &vop_write_desc, mfs_write }, /* write */ + { &vop_ioctl_desc, mfs_ioctl }, /* ioctl */ + { &vop_poll_desc, mfs_poll }, /* poll */ + { &vop_revoke_desc, mfs_revoke }, /* revoke */ + { &vop_mmap_desc, mfs_mmap }, /* mmap */ + { &vop_fsync_desc, spec_fsync }, /* fsync */ + { &vop_seek_desc, mfs_seek }, /* seek */ + { &vop_remove_desc, mfs_remove }, /* remove */ + { &vop_link_desc, mfs_link }, /* link */ + { &vop_rename_desc, mfs_rename }, /* rename */ + { &vop_mkdir_desc, mfs_mkdir }, /* mkdir */ + { &vop_rmdir_desc, mfs_rmdir }, /* rmdir */ + { &vop_symlink_desc, mfs_symlink }, /* symlink */ + { &vop_readdir_desc, mfs_readdir }, /* readdir */ + { &vop_readlink_desc, mfs_readlink }, /* readlink */ + { &vop_abortop_desc, mfs_abortop }, /* abortop */ + { &vop_inactive_desc, mfs_inactive }, /* inactive */ + { &vop_reclaim_desc, mfs_reclaim }, /* reclaim */ + { &vop_lock_desc, genfs_nolock }, /* lock */ + { &vop_unlock_desc, genfs_nounlock }, /* unlock */ + { &vop_bmap_desc, mfs_bmap }, /* bmap */ + { &vop_strategy_desc, mfs_strategy }, /* strategy */ + { &vop_print_desc, mfs_print }, /* print */ + { &vop_islocked_desc, mfs_islocked }, /* islocked */ + { &vop_pathconf_desc, mfs_pathconf }, /* pathconf */ + { &vop_advlock_desc, mfs_advlock }, /* advlock */ + { &vop_bwrite_desc, mfs_bwrite }, /* bwrite */ + { &vop_putpages_desc, mfs_putpages }, /* putpages */ + { NULL, NULL } +}; +const struct vnodeopv_desc mfs_vnodeop_opv_desc = + { &mfs_vnodeop_p, mfs_vnodeop_entries }; + +/* + * Vnode Operations. + * + * Open called to allow memory filesystem to initialize and + * validate before actual IO. Record our process identifier + * so we can tell when we are doing I/O to ourself. + */ +/* ARGSUSED */ +int +mfs_open(void *v) +{ + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + kauth_cred_t a_cred; + } */ *ap = v; + + if (ap->a_vp->v_type != VBLK) { + panic("mfs_ioctl not VBLK"); + /* NOTREACHED */ + } + return (0); +} + +/* + * Pass I/O requests to the memory filesystem process. + */ +int +mfs_strategy(void *v) +{ + struct vop_strategy_args /* { + struct vnode *a_vp; + struct buf *a_bp; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct buf *bp = ap->a_bp; + struct mfsnode *mfsp; + + if (vp->v_type != VBLK || vp->v_usecount == 0) + panic("mfs_strategy: bad dev"); + mfsp = VTOMFS(vp); + /* check for mini-root access */ + if (mfsp->mfs_proc == NULL) { + void *base; + + base = (char *)mfsp->mfs_baseoff + (bp->b_blkno << DEV_BSHIFT); + if (bp->b_flags & B_READ) + memcpy(bp->b_data, base, bp->b_bcount); + else + memcpy(base, bp->b_data, bp->b_bcount); + bp->b_resid = 0; + biodone(bp); + } else if (mfsp->mfs_proc == curproc) { + mfs_doio(bp, mfsp->mfs_baseoff); + } else if (doing_shutdown) { + /* + * bitbucket I/O during shutdown. + * Note that reads should *not* happen here, but.. 
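mfs_strategy() above (and mfs_doio(), which follows) satisfy block I/O by copying to or from the backing memory at byte offset b_blkno << DEV_BSHIFT, i.e. 512-byte device blocks. A user-space sketch of that copy-at-offset scheme, with hypothetical names:

#include <stdint.h>
#include <string.h>

#define DEV_BSHIFT	9	/* 512-byte device blocks, as in the kernel */

/* Copy one request to/from a RAM-backed "disk" starting at base. */
static void
ramdisk_io(void *base, uint64_t blkno, void *data, size_t bcount, int is_read)
{
	char *p = (char *)base + (blkno << DEV_BSHIFT);

	if (is_read)
		memcpy(data, p, bcount);	/* cf. the B_READ branch */
	else
		memcpy(p, data, bcount);
}

int
main(void)
{
	static char disk[64 * 512];
	char buf[512] = "hello";
	char out[512];

	ramdisk_io(disk, 3, buf, sizeof(buf), 0);	/* write block 3 */
	ramdisk_io(disk, 3, out, sizeof(out), 1);	/* read it back */
	return !(memcmp(buf, out, sizeof(buf)) == 0);
}

The kernel version additionally distinguishes the mini-root case (plain memcpy) from mounts backed by a user process (copyin/copyout), but the offset arithmetic is the same.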
+ */ + if (bp->b_flags & B_READ) + printf("warning: mfs read during shutdown\n"); + bp->b_resid = 0; + biodone(bp); + } else { + mutex_enter(&mfs_lock); + bufq_put(mfsp->mfs_buflist, bp); + cv_broadcast(&mfsp->mfs_cv); + mutex_exit(&mfs_lock); + } + return (0); +} + +/* + * Memory file system I/O. + */ +void +mfs_doio(struct buf *bp, void *base) +{ + + base = (char *)base + (bp->b_blkno << DEV_BSHIFT); + if (bp->b_flags & B_READ) + bp->b_error = copyin(base, bp->b_data, bp->b_bcount); + else + bp->b_error = copyout(bp->b_data, base, bp->b_bcount); + if (bp->b_error == 0) + bp->b_resid = 0; + biodone(bp); +} + +/* + * This is a noop, simply returning what one has been given. + */ +int +mfs_bmap(void *v) +{ + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct vnode **a_vpp; + daddr_t *a_bnp; + int *a_runp; + } */ *ap = v; + + if (ap->a_vpp != NULL) + *ap->a_vpp = ap->a_vp; + if (ap->a_bnp != NULL) + *ap->a_bnp = ap->a_bn; + if (ap->a_runp != NULL) + *ap->a_runp = 0; + return (0); +} + +/* + * Memory filesystem close routine + */ +/* ARGSUSED */ +int +mfs_close(void *v) +{ + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct mfsnode *mfsp = VTOMFS(vp); + struct buf *bp; + int error; + + /* + * Finish any pending I/O requests. + */ + mutex_enter(&mfs_lock); + while ((bp = bufq_get(mfsp->mfs_buflist)) != NULL) { + mutex_exit(&mfs_lock); + mfs_doio(bp, mfsp->mfs_baseoff); + mutex_enter(&mfs_lock); + } + mutex_exit(&mfs_lock); + /* + * On last close of a memory filesystem + * we must invalidate any in core blocks, so that + * we can, free up its vnode. + */ + if ((error = vinvalbuf(vp, V_SAVE, ap->a_cred, curlwp, 0, 0)) != 0) + return (error); + /* + * There should be no way to have any more uses of this + * vnode, so if we find any other uses, it is a panic. + */ + if (bufq_peek(mfsp->mfs_buflist) != NULL) + panic("mfs_close"); + /* + * Send a request to the filesystem server to exit. + */ + mutex_enter(&mfs_lock); + mfsp->mfs_shutdown = 1; + cv_broadcast(&mfsp->mfs_cv); + mutex_exit(&mfs_lock); + return (0); +} + +/* + * Memory filesystem inactive routine + */ +/* ARGSUSED */ +int +mfs_inactive(void *v) +{ + struct vop_inactive_args /* { + struct vnode *a_vp; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct mfsnode *mfsp = VTOMFS(vp); + + if (bufq_peek(mfsp->mfs_buflist) != NULL) + panic("mfs_inactive: not inactive (mfs_buflist %p)", + bufq_peek(mfsp->mfs_buflist)); + VOP_UNLOCK(vp); + return (0); +} + +/* + * Reclaim a memory filesystem devvp so that it can be reused. + */ +int +mfs_reclaim(void *v) +{ + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct mfsnode *mfsp = VTOMFS(vp); + int refcnt; + + mutex_enter(&mfs_lock); + vp->v_data = NULL; + refcnt = --mfsp->mfs_refcnt; + mutex_exit(&mfs_lock); + + if (refcnt == 0) { + bufq_free(mfsp->mfs_buflist); + cv_destroy(&mfsp->mfs_cv); + kmem_free(mfsp, sizeof(*mfsp)); + } + + return (0); +} + +/* + * Print out the contents of an mfsnode. + */ +int +mfs_print(void *v) +{ + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap = v; + struct mfsnode *mfsp = VTOMFS(ap->a_vp); + + printf("tag VT_MFS, pid %d, base %p, size %ld\n", + (mfsp->mfs_proc != NULL) ? 
mfsp->mfs_proc->p_pid : 0, + mfsp->mfs_baseoff, mfsp->mfs_size); + return (0); +} diff --git a/include/ufs/mfs/mfsnode.h b/sys/ufs/mfs/mfsnode.h similarity index 100% rename from include/ufs/mfs/mfsnode.h rename to sys/ufs/mfs/mfsnode.h diff --git a/sys/ufs/ufs/Makefile b/sys/ufs/ufs/Makefile new file mode 100644 index 000000000..6f08db609 --- /dev/null +++ b/sys/ufs/ufs/Makefile @@ -0,0 +1,8 @@ +# $NetBSD: Makefile,v 1.7 2011/03/06 17:08:39 bouyer Exp $ + +INCSDIR= /usr/include/ufs/ufs + +INCS= dinode.h dir.h extattr.h inode.h quota.h quota1.h quota2.h \ + ufs_bswap.h ufs_extern.h ufs_wapbl.h ufsmount.h + +.include diff --git a/include/ufs/ufs/dinode.h b/sys/ufs/ufs/dinode.h similarity index 100% rename from include/ufs/ufs/dinode.h rename to sys/ufs/ufs/dinode.h diff --git a/include/ufs/ufs/dir.h b/sys/ufs/ufs/dir.h similarity index 100% rename from include/ufs/ufs/dir.h rename to sys/ufs/ufs/dir.h diff --git a/include/ufs/ufs/dirhash.h b/sys/ufs/ufs/dirhash.h similarity index 100% rename from include/ufs/ufs/dirhash.h rename to sys/ufs/ufs/dirhash.h diff --git a/include/ufs/ufs/extattr.h b/sys/ufs/ufs/extattr.h similarity index 100% rename from include/ufs/ufs/extattr.h rename to sys/ufs/ufs/extattr.h diff --git a/include/ufs/ufs/inode.h b/sys/ufs/ufs/inode.h similarity index 100% rename from include/ufs/ufs/inode.h rename to sys/ufs/ufs/inode.h diff --git a/include/ufs/ufs/quota.h b/sys/ufs/ufs/quota.h similarity index 100% rename from include/ufs/ufs/quota.h rename to sys/ufs/ufs/quota.h diff --git a/include/ufs/ufs/quota1.h b/sys/ufs/ufs/quota1.h similarity index 100% rename from include/ufs/ufs/quota1.h rename to sys/ufs/ufs/quota1.h diff --git a/sys/ufs/ufs/quota1_subr.c b/sys/ufs/ufs/quota1_subr.c new file mode 100644 index 000000000..ff6a06c92 --- /dev/null +++ b/sys/ufs/ufs/quota1_subr.c @@ -0,0 +1,95 @@ +/* $NetBSD: quota1_subr.c,v 1.6 2011/11/25 16:55:05 dholland Exp $ */ +/*- + * Copyright (c) 2010 Manuel Bouyer + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
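The quota1_subr.c conversion routines that follow map between two limit encodings: in the legacy dqblk fields a value of 0 means "no limit" and a non-zero value is one greater than the corresponding quotaval limit, while in struct quotaval UQUAD_MAX means "no limit". A small stand-alone sketch of that mapping and its round trip, with hypothetical function names:

#include <stdint.h>
#include <stdio.h>

#define UQUAD_MAX	UINT64_MAX	/* "no limit" in the quotaval encoding */

/* dqblk encoding: 0 == unlimited, otherwise one past the quotaval limit. */
static uint64_t
dqblk_limit_to_quotaval(uint32_t lim)
{
	return (lim == 0) ? UQUAD_MAX : (uint64_t)lim - 1;
}

static uint32_t
quotaval_limit_to_dqblk(uint64_t lim)
{
	return (lim == UQUAD_MAX) ? 0 : (uint32_t)(lim + 1);
}

int
main(void)
{
	uint32_t disk = 1000;	/* hypothetical on-disk block hard limit */
	uint64_t qv = dqblk_limit_to_quotaval(disk);

	printf("dqblk %u -> quotaval %llu -> dqblk %u\n",
	    disk, (unsigned long long)qv, quotaval_limit_to_dqblk(qv));
	printf("dqblk 0 (unlimited) -> quotaval %#llx\n",
	    (unsigned long long)dqblk_limit_to_quotaval(0));
	return 0;
}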
+ */ + +#include +__KERNEL_RCSID(0, "$NetBSD: quota1_subr.c,v 1.6 2011/11/25 16:55:05 dholland Exp $"); + +#include +#include + +#include +#include +#include + +static uint64_t +dqblk2q2e_limit(uint32_t lim) +{ + if (lim == 0) + return UQUAD_MAX; + else + return (lim - 1); +} + +static uint32_t +q2e2dqblk_limit(uint64_t lim) +{ + if (lim == UQUAD_MAX) + return 0; + else + return (lim + 1); +} + +void +dqblk_to_quotaval(const struct dqblk *dqblk, struct quotaval *qv) +{ + /* XXX is qv_grace getting handled correctly? */ + + qv[QUOTA_LIMIT_BLOCK].qv_hardlimit = + dqblk2q2e_limit(dqblk->dqb_bhardlimit); + qv[QUOTA_LIMIT_BLOCK].qv_softlimit = + dqblk2q2e_limit(dqblk->dqb_bsoftlimit); + qv[QUOTA_LIMIT_BLOCK].qv_usage = dqblk->dqb_curblocks; + qv[QUOTA_LIMIT_BLOCK].qv_expiretime = dqblk->dqb_btime; + + qv[QUOTA_LIMIT_FILE].qv_hardlimit = + dqblk2q2e_limit(dqblk->dqb_ihardlimit); + qv[QUOTA_LIMIT_FILE].qv_softlimit = + dqblk2q2e_limit(dqblk->dqb_isoftlimit); + qv[QUOTA_LIMIT_FILE].qv_usage = dqblk->dqb_curinodes; + qv[QUOTA_LIMIT_FILE].qv_expiretime = dqblk->dqb_itime; +} + +void +quotaval_to_dqblk(const struct quotaval *qv, struct dqblk *dqblk) +{ + /* XXX is qv_grace getting handled correctly? */ + + dqblk->dqb_bhardlimit = + q2e2dqblk_limit(qv[QUOTA_LIMIT_BLOCK].qv_hardlimit); + dqblk->dqb_bsoftlimit = + q2e2dqblk_limit(qv[QUOTA_LIMIT_BLOCK].qv_softlimit); + dqblk->dqb_curblocks = qv[QUOTA_LIMIT_BLOCK].qv_usage; + dqblk->dqb_btime = qv[QUOTA_LIMIT_BLOCK].qv_expiretime; + + dqblk->dqb_ihardlimit = + q2e2dqblk_limit(qv[QUOTA_LIMIT_FILE].qv_hardlimit); + dqblk->dqb_isoftlimit = + q2e2dqblk_limit(qv[QUOTA_LIMIT_FILE].qv_softlimit); + dqblk->dqb_curinodes = qv[QUOTA_LIMIT_FILE].qv_usage; + dqblk->dqb_itime = qv[QUOTA_LIMIT_FILE].qv_expiretime; +} + diff --git a/include/ufs/ufs/quota2.h b/sys/ufs/ufs/quota2.h similarity index 100% rename from include/ufs/ufs/quota2.h rename to sys/ufs/ufs/quota2.h diff --git a/sys/ufs/ufs/quota2_subr.c b/sys/ufs/ufs/quota2_subr.c new file mode 100644 index 000000000..f91007f1b --- /dev/null +++ b/sys/ufs/ufs/quota2_subr.c @@ -0,0 +1,108 @@ +/* $NetBSD: quota2_subr.c,v 1.4 2011/06/07 14:56:13 bouyer Exp $ */ +/*- + * Copyright (c) 2010 Manuel Bouyer + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
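quota2_subr.c below stores every on-disk field through ufs_rw16/32/64, which byte-swap only when the filesystem's byte order differs from the host's (the needswap/ns argument). A minimal sketch of that conditional-swap idea, using the GCC/Clang builtin rather than the kernel's bswap macros; the names are illustrative only:

#include <stdint.h>
#include <stdio.h>

/* Swap only when the on-disk byte order differs from the host's. */
static uint64_t
rw64(uint64_t v, int needswap)
{
	return needswap ? __builtin_bswap64(v) : v;
}

int
main(void)
{
	uint64_t grace = 7ULL * 24 * 3600;	/* 7 days, as in the default entry */

	printf("native:  %#llx\n", (unsigned long long)rw64(grace, 0));
	printf("swapped: %#llx\n", (unsigned long long)rw64(grace, 1));
	return 0;
}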
+ */ + +#include +__KERNEL_RCSID(0, "$NetBSD: quota2_subr.c,v 1.4 2011/06/07 14:56:13 bouyer Exp $"); + +#include +#include + +#include +#include +#include +#include +#include + +#ifndef _KERNEL +#include +#endif + +void +quota2_addfreeq2e(struct quota2_header *q2h, void *bp, uint64_t baseoff, + uint64_t bsize, int ns) +{ + uint64_t blkoff = baseoff % bsize; + int i, nq2e; + struct quota2_entry *q2e; + + q2e = (void *)((char *)bp + blkoff); + nq2e = (bsize - blkoff) / sizeof(*q2e); + for (i = 0; i < nq2e; i++) { + q2e[i].q2e_next = q2h->q2h_free; + q2h->q2h_free = ufs_rw64(i * sizeof(*q2e) + baseoff, ns); + } +} + +void +quota2_create_blk0(uint64_t bsize, void *bp, int q2h_hash_shift, int type, + int ns) +{ + struct quota2_header *q2h; + const int quota2_hash_size = 1 << q2h_hash_shift; + const int quota2_full_header_size = sizeof(struct quota2_header) + + sizeof(q2h->q2h_entries[0]) * quota2_hash_size; + int i; + + memset(bp, 0, bsize); + q2h = bp; + q2h->q2h_magic_number = ufs_rw32(Q2_HEAD_MAGIC, ns); + q2h->q2h_type = type; + q2h->q2h_hash_shift = q2h_hash_shift; + q2h->q2h_hash_size = ufs_rw16(quota2_hash_size, ns); + /* setup defaut entry: unlimited, 7 days grace */ + for (i = 0; i < N_QL; i++) { + q2h->q2h_defentry.q2e_val[i].q2v_hardlimit = + q2h->q2h_defentry.q2e_val[i].q2v_softlimit = + ufs_rw64(UQUAD_MAX, ns); + q2h->q2h_defentry.q2e_val[i].q2v_grace = + ufs_rw64(7ULL * 24ULL * 3600ULL, ns); + } + + /* first quota entry, after the hash table */ + quota2_addfreeq2e(q2h, bp, quota2_full_header_size, bsize, ns); +} + +void +quota2_ufs_rwq2v(const struct quota2_val *s, struct quota2_val *d, int needswap) +{ + d->q2v_hardlimit = ufs_rw64(s->q2v_hardlimit, needswap); + d->q2v_softlimit = ufs_rw64(s->q2v_softlimit, needswap); + d->q2v_cur = ufs_rw64(s->q2v_cur, needswap); + d->q2v_time = ufs_rw64(s->q2v_time, needswap); + d->q2v_grace = ufs_rw64(s->q2v_grace, needswap); +} + +void +quota2_ufs_rwq2e(const struct quota2_entry *s, struct quota2_entry *d, +int needswap) +{ + quota2_ufs_rwq2v(&s->q2e_val[QL_BLOCK], &d->q2e_val[QL_BLOCK], + needswap); + quota2_ufs_rwq2v(&s->q2e_val[QL_FILE], &d->q2e_val[QL_FILE], + needswap); + d->q2e_uid = ufs_rw32(s->q2e_uid, needswap); +} diff --git a/sys/ufs/ufs/ufs_bmap.c b/sys/ufs/ufs/ufs_bmap.c new file mode 100644 index 000000000..3420e227a --- /dev/null +++ b/sys/ufs/ufs/ufs_bmap.c @@ -0,0 +1,405 @@ +/* $NetBSD: ufs_bmap.c,v 1.49 2011/03/06 17:08:39 bouyer Exp $ */ + +/* + * Copyright (c) 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_bmap.c 8.8 (Berkeley) 8/11/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ufs_bmap.c,v 1.49 2011/03/06 17:08:39 bouyer Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +static bool +ufs_issequential(const struct ufsmount *ump, daddr_t daddr0, daddr_t daddr1) +{ + + /* for ufs, blocks in a hole is not 'contiguous'. */ + if (daddr0 == 0) + return false; + + return (daddr0 + ump->um_seqinc == daddr1); +} + +/* + * Bmap converts the logical block number of a file to its physical block + * number on the disk. The conversion is done by using the logical block + * number to index into the array of block pointers described by the dinode. + */ +int +ufs_bmap(void *v) +{ + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct vnode **a_vpp; + daddr_t *a_bnp; + int *a_runp; + } */ *ap = v; + int error; + + /* + * Check for underlying vnode requests and ensure that logical + * to physical mapping is requested. + */ + if (ap->a_vpp != NULL) + *ap->a_vpp = VTOI(ap->a_vp)->i_devvp; + if (ap->a_bnp == NULL) + return (0); + + fstrans_start(ap->a_vp->v_mount, FSTRANS_SHARED); + error = ufs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, NULL, NULL, + ap->a_runp, ufs_issequential); + fstrans_done(ap->a_vp->v_mount); + return error; +} + +/* + * Indirect blocks are now on the vnode for the file. They are given negative + * logical block numbers. Indirect blocks are addressed by the negative + * address of the first data block to which they point. Double indirect blocks + * are addressed by one less than the address of the first indirect block to + * which they point. Triple indirect blocks are addressed by one less than + * the address of the first double indirect block to which they point. + * + * ufs_bmaparray does the bmap conversion, and if requested returns the + * array of logical blocks which must be traversed to get to a block. + * Each entry contains the offset into that block that gets you to the + * next block and the disk address of the block (if it is assigned). 
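The comment above explains how blocks past the NDADDR direct pointers are reached through one, two or three levels of indirect blocks, and ufs_getlbns() further below finds the level by repeatedly multiplying by the number of pointers an indirect block holds. A compact sketch of that level calculation under assumed parameters (NDADDR = 12, NIADDR = 3, and e.g. 2048 pointers per indirect block for 8 KB UFS1 blocks):

#include <stdio.h>

#define NDADDR	12	/* direct block pointers in the inode */
#define NIADDR	3	/* indirect pointers: single, double, triple */

/*
 * Return the number of indirection levels (1..NIADDR) needed to reach
 * logical block "bn", -1 if it is a direct block, or 0 if it lies beyond
 * the triple-indirect range (the EFBIG case in ufs_getlbns()).
 * "nindir" is the number of block pointers one indirect block can hold.
 */
static int
indir_levels(long long bn, long long nindir)
{
	long long blockcnt = 1;

	if (bn < NDADDR)
		return -1;			/* direct block */
	bn -= NDADDR;
	for (int i = 1; i <= NIADDR; i++) {
		blockcnt *= nindir;		/* blocks reachable at this level */
		if (bn < blockcnt)
			return i;
		bn -= blockcnt;
	}
	return 0;
}

int
main(void)
{
	printf("%d %d %d\n", indir_levels(5, 2048),
	    indir_levels(100, 2048), indir_levels(3000000, 2048));
	return 0;
}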
+ */ + +int +ufs_bmaparray(struct vnode *vp, daddr_t bn, daddr_t *bnp, struct indir *ap, + int *nump, int *runp, ufs_issequential_callback_t is_sequential) +{ + struct inode *ip; + struct buf *bp, *cbp; + struct ufsmount *ump; + struct mount *mp; + struct indir a[NIADDR + 1], *xap; + daddr_t daddr; + daddr_t metalbn; + int error, maxrun = 0, num; + + ip = VTOI(vp); + mp = vp->v_mount; + ump = ip->i_ump; +#ifdef DIAGNOSTIC + if ((ap != NULL && nump == NULL) || (ap == NULL && nump != NULL)) + panic("ufs_bmaparray: invalid arguments"); +#endif + + if (runp) { + /* + * XXX + * If MAXBSIZE is the largest transfer the disks can handle, + * we probably want maxrun to be 1 block less so that we + * don't create a block larger than the device can handle. + */ + *runp = 0; + maxrun = MAXPHYS / mp->mnt_stat.f_iosize - 1; + } + + if (bn >= 0 && bn < NDADDR) { + if (nump != NULL) + *nump = 0; + if (ump->um_fstype == UFS1) + daddr = ufs_rw32(ip->i_ffs1_db[bn], + UFS_MPNEEDSWAP(ump)); + else + daddr = ufs_rw64(ip->i_ffs2_db[bn], + UFS_MPNEEDSWAP(ump)); + *bnp = blkptrtodb(ump, daddr); + /* + * Since this is FFS independent code, we are out of + * scope for the definitions of BLK_NOCOPY and + * BLK_SNAP, but we do know that they will fall in + * the range 1..um_seqinc, so we use that test and + * return a request for a zeroed out buffer if attempts + * are made to read a BLK_NOCOPY or BLK_SNAP block. + */ + if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT + && daddr > 0 && + daddr < ump->um_seqinc) { + *bnp = -1; + } else if (*bnp == 0) { + if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) + == SF_SNAPSHOT) { + *bnp = blkptrtodb(ump, bn * ump->um_seqinc); + } else { + *bnp = -1; + } + } else if (runp) { + if (ump->um_fstype == UFS1) { + for (++bn; bn < NDADDR && *runp < maxrun && + is_sequential(ump, + ufs_rw32(ip->i_ffs1_db[bn - 1], + UFS_MPNEEDSWAP(ump)), + ufs_rw32(ip->i_ffs1_db[bn], + UFS_MPNEEDSWAP(ump))); + ++bn, ++*runp); + } else { + for (++bn; bn < NDADDR && *runp < maxrun && + is_sequential(ump, + ufs_rw64(ip->i_ffs2_db[bn - 1], + UFS_MPNEEDSWAP(ump)), + ufs_rw64(ip->i_ffs2_db[bn], + UFS_MPNEEDSWAP(ump))); + ++bn, ++*runp); + } + } + return (0); + } + + xap = ap == NULL ? a : ap; + if (!nump) + nump = # + if ((error = ufs_getlbns(vp, bn, xap, nump)) != 0) + return (error); + + num = *nump; + + /* Get disk address out of indirect block array */ + if (ump->um_fstype == UFS1) + daddr = ufs_rw32(ip->i_ffs1_ib[xap->in_off], + UFS_MPNEEDSWAP(ump)); + else + daddr = ufs_rw64(ip->i_ffs2_ib[xap->in_off], + UFS_MPNEEDSWAP(ump)); + + for (bp = NULL, ++xap; --num; ++xap) { + /* + * Exit the loop if there is no disk address assigned yet and + * the indirect block isn't in the cache, or if we were + * looking for an indirect block and we've found it. + */ + + metalbn = xap->in_lbn; + if (metalbn == bn) + break; + if (daddr == 0) { + mutex_enter(&bufcache_lock); + cbp = incore(vp, metalbn); + mutex_exit(&bufcache_lock); + if (cbp == NULL) + break; + } + + /* + * If we get here, we've either got the block in the cache + * or we have a disk address for it, go fetch it. + */ + if (bp) + brelse(bp, 0); + + xap->in_exists = 1; + bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0); + if (bp == NULL) { + + /* + * getblk() above returns NULL only iff we are + * pagedaemon. See the implementation of getblk + * for detail. 
+ */ + + return (ENOMEM); + } + if (bp->b_oflags & (BO_DONE | BO_DELWRI)) { + trace(TR_BREADHIT, pack(vp, size), metalbn); + } +#ifdef DIAGNOSTIC + else if (!daddr) + panic("ufs_bmaparray: indirect block not in cache"); +#endif + else { + trace(TR_BREADMISS, pack(vp, size), metalbn); + bp->b_blkno = blkptrtodb(ump, daddr); + bp->b_flags |= B_READ; + BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); + VOP_STRATEGY(vp, bp); + curlwp->l_ru.ru_inblock++; /* XXX */ + if ((error = biowait(bp)) != 0) { + brelse(bp, 0); + return (error); + } + } + if (ump->um_fstype == UFS1) { + daddr = ufs_rw32(((u_int32_t *)bp->b_data)[xap->in_off], + UFS_MPNEEDSWAP(ump)); + if (num == 1 && daddr && runp) { + for (bn = xap->in_off + 1; + bn < MNINDIR(ump) && *runp < maxrun && + is_sequential(ump, + ufs_rw32(((int32_t *)bp->b_data)[bn-1], + UFS_MPNEEDSWAP(ump)), + ufs_rw32(((int32_t *)bp->b_data)[bn], + UFS_MPNEEDSWAP(ump))); + ++bn, ++*runp); + } + } else { + daddr = ufs_rw64(((u_int64_t *)bp->b_data)[xap->in_off], + UFS_MPNEEDSWAP(ump)); + if (num == 1 && daddr && runp) { + for (bn = xap->in_off + 1; + bn < MNINDIR(ump) && *runp < maxrun && + is_sequential(ump, + ufs_rw64(((int64_t *)bp->b_data)[bn-1], + UFS_MPNEEDSWAP(ump)), + ufs_rw64(((int64_t *)bp->b_data)[bn], + UFS_MPNEEDSWAP(ump))); + ++bn, ++*runp); + } + } + } + if (bp) + brelse(bp, 0); + + /* + * Since this is FFS independent code, we are out of scope for the + * definitions of BLK_NOCOPY and BLK_SNAP, but we do know that they + * will fall in the range 1..um_seqinc, so we use that test and + * return a request for a zeroed out buffer if attempts are made + * to read a BLK_NOCOPY or BLK_SNAP block. + */ + if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT + && daddr > 0 && daddr < ump->um_seqinc) { + *bnp = -1; + return (0); + } + *bnp = blkptrtodb(ump, daddr); + if (*bnp == 0) { + if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) + == SF_SNAPSHOT) { + *bnp = blkptrtodb(ump, bn * ump->um_seqinc); + } else { + *bnp = -1; + } + } + return (0); +} + +/* + * Create an array of logical block number/offset pairs which represent the + * path of indirect blocks required to access a data block. The first "pair" + * contains the logical block number of the appropriate single, double or + * triple indirect block and the offset into the inode indirect block array. + * Note, the logical block number of the inode single/double/triple indirect + * block appears twice in the array, once with the offset into the i_ffs1_ib and + * once with the offset into the page itself. + */ +int +ufs_getlbns(struct vnode *vp, daddr_t bn, struct indir *ap, int *nump) +{ + daddr_t metalbn, realbn; + struct ufsmount *ump; + int64_t blockcnt; + int lbc; + int i, numlevels, off; + + ump = VFSTOUFS(vp->v_mount); + if (nump) + *nump = 0; + numlevels = 0; + realbn = bn; + if (bn < 0) + bn = -bn; + KASSERT(bn >= NDADDR); + + /* + * Determine the number of levels of indirection. After this loop + * is done, blockcnt indicates the number of data blocks possible + * at the given level of indirection, and NIADDR - i is the number + * of levels of indirection needed to locate the requested block. + */ + + bn -= NDADDR; + for (lbc = 0, i = NIADDR;; i--, bn -= blockcnt) { + if (i == 0) + return (EFBIG); + + lbc += ump->um_lognindir; + blockcnt = (int64_t)1 << lbc; + + if (bn < blockcnt) + break; + } + + /* Calculate the address of the first meta-block. */ + metalbn = -((realbn >= 0 ? 
realbn : -realbn) - bn + NIADDR - i); + + /* + * At each iteration, off is the offset into the bap array which is + * an array of disk addresses at the current level of indirection. + * The logical block number and the offset in that block are stored + * into the argument array. + */ + ap->in_lbn = metalbn; + ap->in_off = off = NIADDR - i; + ap->in_exists = 0; + ap++; + for (++numlevels; i <= NIADDR; i++) { + /* If searching for a meta-data block, quit when found. */ + if (metalbn == realbn) + break; + + lbc -= ump->um_lognindir; + off = (bn >> lbc) & (MNINDIR(ump) - 1); + + ++numlevels; + ap->in_lbn = metalbn; + ap->in_off = off; + ap->in_exists = 0; + ++ap; + + metalbn -= -1 + ((int64_t)off << lbc); + } + if (nump) + *nump = numlevels; + return (0); +} diff --git a/include/ufs/ufs/ufs_bswap.h b/sys/ufs/ufs/ufs_bswap.h similarity index 100% rename from include/ufs/ufs/ufs_bswap.h rename to sys/ufs/ufs/ufs_bswap.h diff --git a/sys/ufs/ufs/ufs_dirhash.c b/sys/ufs/ufs/ufs_dirhash.c new file mode 100644 index 000000000..e893a93f0 --- /dev/null +++ b/sys/ufs/ufs/ufs_dirhash.c @@ -0,0 +1,1171 @@ +/* $NetBSD: ufs_dirhash.c,v 1.34 2009/10/05 23:48:08 rmind Exp $ */ + +/* + * Copyright (c) 2001, 2002 Ian Dowse. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/sys/ufs/ufs/ufs_dirhash.c,v 1.3.2.8 2004/12/08 11:54:13 dwmalone Exp $ + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ufs_dirhash.c,v 1.34 2009/10/05 23:48:08 rmind Exp $"); + +/* + * This implements a hash-based lookup scheme for UFS directories. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define WRAPINCR(val, limit) (((val) + 1 == (limit)) ? 0 : ((val) + 1)) +#define WRAPDECR(val, limit) (((val) == 0) ? ((limit) - 1) : ((val) - 1)) +#define OFSFMT(ip) ((ip)->i_ump->um_maxsymlinklen <= 0) +#define BLKFREE2IDX(n) ((n) > DH_NFSTATS ? 
DH_NFSTATS : (n)) + +static u_int ufs_dirhashminblks = 5; +static u_int ufs_dirhashmaxmem = 2 * 1024 * 1024; +static u_int ufs_dirhashmem; +static u_int ufs_dirhashcheck = 0; + +static int ufsdirhash_hash(struct dirhash *dh, const char *name, int namelen); +static void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff, + int dirblksiz); +static void ufsdirhash_delslot(struct dirhash *dh, int slot); +static int ufsdirhash_findslot(struct dirhash *dh, const char *name, + int namelen, doff_t offset); +static doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset, + int dirblksiz); +static int ufsdirhash_recycle(int wanted); + +static pool_cache_t ufsdirhashblk_cache; +static pool_cache_t ufsdirhash_cache; + +#define DIRHASHLIST_LOCK() mutex_enter(&ufsdirhash_lock) +#define DIRHASHLIST_UNLOCK() mutex_exit(&ufsdirhash_lock) +#define DIRHASH_LOCK(dh) mutex_enter(&(dh)->dh_lock) +#define DIRHASH_UNLOCK(dh) mutex_exit(&(dh)->dh_lock) +#define DIRHASH_BLKALLOC() \ + pool_cache_get(ufsdirhashblk_cache, PR_NOWAIT) +#define DIRHASH_BLKFREE(ptr) \ + pool_cache_put(ufsdirhashblk_cache, ptr) + +/* Dirhash list; recently-used entries are near the tail. */ +static TAILQ_HEAD(, dirhash) ufsdirhash_list; + +/* Protects: ufsdirhash_list, `dh_list' field, ufs_dirhashmem. */ +static kmutex_t ufsdirhash_lock; + +static struct sysctllog *ufsdirhash_sysctl_log; + +/* + * Locking order: + * ufsdirhash_lock + * dh_lock + * + * The dh_lock mutex should be acquired either via the inode lock, or via + * ufsdirhash_lock. Only the owner of the inode may free the associated + * dirhash, but anything can steal its memory and set dh_hash to NULL. + */ + +/* + * Attempt to build up a hash table for the directory contents in + * inode 'ip'. Returns 0 on success, or -1 of the operation failed. + */ +int +ufsdirhash_build(struct inode *ip) +{ + struct dirhash *dh; + struct buf *bp = NULL; + struct direct *ep; + struct vnode *vp; + doff_t bmask, pos; + int dirblocks, i, j, memreqd, nblocks, narrays, nslots, slot; + const int needswap = UFS_MPNEEDSWAP(ip->i_ump); + int dirblksiz = ip->i_ump->um_dirblksiz; + + /* Check if we can/should use dirhash. */ + if (ip->i_dirhash == NULL) { + if (ip->i_size < (ufs_dirhashminblks * dirblksiz) || OFSFMT(ip)) + return (-1); + } else { + /* Hash exists, but sysctls could have changed. */ + if (ip->i_size < (ufs_dirhashminblks * dirblksiz) || + ufs_dirhashmem > ufs_dirhashmaxmem) { + ufsdirhash_free(ip); + return (-1); + } + /* Check if hash exists and is intact (note: unlocked read). */ + if (ip->i_dirhash->dh_hash != NULL) + return (0); + /* Free the old, recycled hash and build a new one. */ + ufsdirhash_free(ip); + } + + /* Don't hash removed directories. */ + if (ip->i_nlink == 0) + return (-1); + + vp = ip->i_vnode; + /* Allocate 50% more entries than this dir size could ever need. */ + KASSERT(ip->i_size >= dirblksiz); + nslots = ip->i_size / DIRECTSIZ(1); + nslots = (nslots * 3 + 1) / 2; + narrays = howmany(nslots, DH_NBLKOFF); + nslots = narrays * DH_NBLKOFF; + dirblocks = howmany(ip->i_size, dirblksiz); + nblocks = (dirblocks * 3 + 1) / 2; + + memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) + + narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) + + nblocks * sizeof(*dh->dh_blkfree); + + while (atomic_add_int_nv(&ufs_dirhashmem, memreqd) > + ufs_dirhashmaxmem) { + atomic_add_int(&ufs_dirhashmem, -memreqd); + if (memreqd > ufs_dirhashmaxmem / 2) + return (-1); + /* Try to free some space. 
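ufsdirhash_build() above charges its projected memory use to ufs_dirhashmem with an atomic add and immediately backs the charge out if the total exceeded ufs_dirhashmaxmem. A user-space sketch of that reserve-then-undo pattern using C11 atomics, with hypothetical names; the kernel version retries after recycling old hashes instead of failing outright:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint pool_used;
static unsigned int pool_limit = 2 * 1024 * 1024;

/* Try to reserve "req" bytes from the budget; undo the add on failure. */
static bool
budget_reserve(unsigned int req)
{
	unsigned int newval = atomic_fetch_add(&pool_used, req) + req;

	if (newval > pool_limit) {
		atomic_fetch_sub(&pool_used, req);	/* back the charge out */
		return false;
	}
	return true;
}

static void
budget_release(unsigned int req)
{
	atomic_fetch_sub(&pool_used, req);
}

int
main(void)
{
	printf("%d\n", budget_reserve(1024));			/* 1: fits */
	printf("%d\n", budget_reserve(4u * 1024 * 1024));	/* 0: over the limit */
	budget_release(1024);
	return 0;
}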
*/ + if (ufsdirhash_recycle(memreqd) != 0) + return (-1); + else + DIRHASHLIST_UNLOCK(); + } + + /* + * Use non-blocking mallocs so that we will revert to a linear + * lookup on failure rather than potentially blocking forever. + */ + dh = pool_cache_get(ufsdirhash_cache, PR_NOWAIT); + if (dh == NULL) { + atomic_add_int(&ufs_dirhashmem, -memreqd); + return (-1); + } + memset(dh, 0, sizeof(*dh)); + mutex_init(&dh->dh_lock, MUTEX_DEFAULT, IPL_NONE); + DIRHASH_LOCK(dh); + dh->dh_hashsz = narrays * sizeof(dh->dh_hash[0]); + dh->dh_hash = kmem_zalloc(dh->dh_hashsz, KM_NOSLEEP); + dh->dh_blkfreesz = nblocks * sizeof(dh->dh_blkfree[0]); + dh->dh_blkfree = kmem_zalloc(dh->dh_blkfreesz, KM_NOSLEEP); + if (dh->dh_hash == NULL || dh->dh_blkfree == NULL) + goto fail; + for (i = 0; i < narrays; i++) { + if ((dh->dh_hash[i] = DIRHASH_BLKALLOC()) == NULL) + goto fail; + for (j = 0; j < DH_NBLKOFF; j++) + dh->dh_hash[i][j] = DIRHASH_EMPTY; + } + + /* Initialise the hash table and block statistics. */ + dh->dh_narrays = narrays; + dh->dh_hlen = nslots; + dh->dh_nblk = nblocks; + dh->dh_dirblks = dirblocks; + for (i = 0; i < dirblocks; i++) + dh->dh_blkfree[i] = dirblksiz / DIRALIGN; + for (i = 0; i < DH_NFSTATS; i++) + dh->dh_firstfree[i] = -1; + dh->dh_firstfree[DH_NFSTATS] = 0; + dh->dh_seqopt = 0; + dh->dh_seqoff = 0; + dh->dh_score = DH_SCOREINIT; + ip->i_dirhash = dh; + + bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1; + pos = 0; + while (pos < ip->i_size) { + if ((curcpu()->ci_schedstate.spc_flags & SPCF_SHOULDYIELD) + != 0) { + preempt(); + } + /* If necessary, get the next directory block. */ + if ((pos & bmask) == 0) { + if (bp != NULL) + brelse(bp, 0); + if (ufs_blkatoff(vp, (off_t)pos, NULL, &bp, false) != 0) + goto fail; + } + + /* Add this entry to the hash. */ + ep = (struct direct *)((char *)bp->b_data + (pos & bmask)); + if (ep->d_reclen == 0 || ep->d_reclen > + dirblksiz - (pos & (dirblksiz - 1))) { + /* Corrupted directory. */ + brelse(bp, 0); + goto fail; + } + if (ep->d_ino != 0) { + /* Add the entry (simplified ufsdirhash_add). */ + slot = ufsdirhash_hash(dh, ep->d_name, ep->d_namlen); + while (DH_ENTRY(dh, slot) != DIRHASH_EMPTY) + slot = WRAPINCR(slot, dh->dh_hlen); + dh->dh_hused++; + DH_ENTRY(dh, slot) = pos; + ufsdirhash_adjfree(dh, pos, -DIRSIZ(0, ep, needswap), + dirblksiz); + } + pos += ep->d_reclen; + } + + if (bp != NULL) + brelse(bp, 0); + DIRHASHLIST_LOCK(); + TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list); + dh->dh_onlist = 1; + DIRHASH_UNLOCK(dh); + DIRHASHLIST_UNLOCK(); + return (0); + +fail: + DIRHASH_UNLOCK(dh); + if (dh->dh_hash != NULL) { + for (i = 0; i < narrays; i++) + if (dh->dh_hash[i] != NULL) + DIRHASH_BLKFREE(dh->dh_hash[i]); + kmem_free(dh->dh_hash, dh->dh_hashsz); + } + if (dh->dh_blkfree != NULL) + kmem_free(dh->dh_blkfree, dh->dh_blkfreesz); + mutex_destroy(&dh->dh_lock); + pool_cache_put(ufsdirhash_cache, dh); + ip->i_dirhash = NULL; + atomic_add_int(&ufs_dirhashmem, -memreqd); + return (-1); +} + +/* + * Free any hash table associated with inode 'ip'. + */ +void +ufsdirhash_free(struct inode *ip) +{ + struct dirhash *dh; + int i, mem; + + if ((dh = ip->i_dirhash) == NULL) + return; + + if (dh->dh_onlist) { + DIRHASHLIST_LOCK(); + if (dh->dh_onlist) + TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); + DIRHASHLIST_UNLOCK(); + } + + /* The dirhash pointed to by 'dh' is exclusively ours now. 
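The build loop above inserts each directory entry with open addressing: hash the name, then advance with WRAPINCR past occupied slots until an empty one is found, and ufsdirhash_lookup() later walks the same probe sequence. A compact stand-alone sketch of that insert/lookup scheme with a toy hash; unlike the real code, which re-reads the directory block to compare names, this toy stores the names in the table:

#include <stdio.h>
#include <string.h>

#define HLEN	16			/* table size, cf. dh_hlen */
#define EMPTY	(-1)			/* cf. DIRHASH_EMPTY */
#define WRAPINCR(v, lim) (((v) + 1 == (lim)) ? 0 : ((v) + 1))

static int slots[HLEN];			/* stores directory offsets */
static char names[HLEN][32];		/* toy stand-in for the on-disk entry */

static unsigned
hash(const char *name)
{
	unsigned h = 0;

	while (*name)
		h = h * 31 + (unsigned char)*name++;	/* toy hash function */
	return h % HLEN;
}

static void
insert(const char *name, int offset)
{
	unsigned slot = hash(name);

	while (slots[slot] != EMPTY)		/* probe past occupied slots */
		slot = WRAPINCR(slot, HLEN);
	slots[slot] = offset;
	strncpy(names[slot], name, sizeof(names[slot]) - 1);
}

static int
lookup(const char *name)
{
	unsigned slot = hash(name);

	while (slots[slot] != EMPTY) {		/* same probe sequence as insert */
		if (strcmp(names[slot], name) == 0)
			return slots[slot];
		slot = WRAPINCR(slot, HLEN);
	}
	return EMPTY;
}

int
main(void)
{
	for (int i = 0; i < HLEN; i++)
		slots[i] = EMPTY;
	insert("foo.c", 0);
	insert("bar.c", 512);
	printf("%d %d %d\n", lookup("foo.c"), lookup("bar.c"), lookup("baz"));
	return 0;
}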
*/ + mem = sizeof(*dh); + if (dh->dh_hash != NULL) { + for (i = 0; i < dh->dh_narrays; i++) + DIRHASH_BLKFREE(dh->dh_hash[i]); + kmem_free(dh->dh_hash, dh->dh_hashsz); + kmem_free(dh->dh_blkfree, dh->dh_blkfreesz); + mem += dh->dh_hashsz; + mem += dh->dh_narrays * DH_NBLKOFF * sizeof(**dh->dh_hash); + mem += dh->dh_nblk * sizeof(*dh->dh_blkfree); + } + mutex_destroy(&dh->dh_lock); + pool_cache_put(ufsdirhash_cache, dh); + ip->i_dirhash = NULL; + + atomic_add_int(&ufs_dirhashmem, -mem); +} + +/* + * Find the offset of the specified name within the given inode. + * Returns 0 on success, ENOENT if the entry does not exist, or + * EJUSTRETURN if the caller should revert to a linear search. + * + * If successful, the directory offset is stored in *offp, and a + * pointer to a struct buf containing the entry is stored in *bpp. If + * prevoffp is non-NULL, the offset of the previous entry within + * the DIRBLKSIZ-sized block is stored in *prevoffp (if the entry + * is the first in a block, the start of the block is used). + */ +int +ufsdirhash_lookup(struct inode *ip, const char *name, int namelen, doff_t *offp, + struct buf **bpp, doff_t *prevoffp) +{ + struct dirhash *dh, *dh_next; + struct direct *dp; + struct vnode *vp; + struct buf *bp; + doff_t blkoff, bmask, offset, prevoff; + int i, slot; + const int needswap = UFS_MPNEEDSWAP(ip->i_ump); + int dirblksiz = ip->i_ump->um_dirblksiz; + + if ((dh = ip->i_dirhash) == NULL) + return (EJUSTRETURN); + + /* + * Move this dirhash towards the end of the list if it has a + * score higher than the next entry, and acquire the dh_lock. + * Optimise the case where it's already the last by performing + * an unlocked read of the TAILQ_NEXT pointer. + * + * In both cases, end up holding just dh_lock. + */ + if (TAILQ_NEXT(dh, dh_list) != NULL) { + DIRHASHLIST_LOCK(); + DIRHASH_LOCK(dh); + /* + * If the new score will be greater than that of the next + * entry, then move this entry past it. With both mutexes + * held, dh_next won't go away, but its dh_score could + * change; that's not important since it is just a hint. + */ + if (dh->dh_hash != NULL && + (dh_next = TAILQ_NEXT(dh, dh_list)) != NULL && + dh->dh_score >= dh_next->dh_score) { + KASSERT(dh->dh_onlist); + TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); + TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh, + dh_list); + } + DIRHASHLIST_UNLOCK(); + } else { + /* Already the last, though that could change as we wait. */ + DIRHASH_LOCK(dh); + } + if (dh->dh_hash == NULL) { + DIRHASH_UNLOCK(dh); + ufsdirhash_free(ip); + return (EJUSTRETURN); + } + + /* Update the score. */ + if (dh->dh_score < DH_SCOREMAX) + dh->dh_score++; + + vp = ip->i_vnode; + bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1; + blkoff = -1; + bp = NULL; +restart: + slot = ufsdirhash_hash(dh, name, namelen); + + if (dh->dh_seqopt) { + /* + * Sequential access optimisation. dh_seqoff contains the + * offset of the directory entry immediately following + * the last entry that was looked up. Check if this offset + * appears in the hash chain for the name we are looking for. + */ + for (i = slot; (offset = DH_ENTRY(dh, i)) != DIRHASH_EMPTY; + i = WRAPINCR(i, dh->dh_hlen)) + if (offset == dh->dh_seqoff) + break; + if (offset == dh->dh_seqoff) { + /* + * We found an entry with the expected offset. This + * is probably the entry we want, but if not, the + * code below will turn off seqoff and retry. 
+ */ + slot = i; + } else + dh->dh_seqopt = 0; + } + + for (; (offset = DH_ENTRY(dh, slot)) != DIRHASH_EMPTY; + slot = WRAPINCR(slot, dh->dh_hlen)) { + if (offset == DIRHASH_DEL) + continue; + + if (offset < 0 || offset >= ip->i_size) + panic("ufsdirhash_lookup: bad offset in hash array"); + if ((offset & ~bmask) != blkoff) { + if (bp != NULL) + brelse(bp, 0); + blkoff = offset & ~bmask; + if (ufs_blkatoff(vp, (off_t)blkoff, + NULL, &bp, false) != 0) { + DIRHASH_UNLOCK(dh); + return (EJUSTRETURN); + } + } + dp = (struct direct *)((char *)bp->b_data + (offset & bmask)); + if (dp->d_reclen == 0 || dp->d_reclen > + dirblksiz - (offset & (dirblksiz - 1))) { + /* Corrupted directory. */ + DIRHASH_UNLOCK(dh); + brelse(bp, 0); + return (EJUSTRETURN); + } + if (dp->d_namlen == namelen && + memcmp(dp->d_name, name, namelen) == 0) { + /* Found. Get the prev offset if needed. */ + if (prevoffp != NULL) { + if (offset & (dirblksiz - 1)) { + prevoff = ufsdirhash_getprev(dp, + offset, dirblksiz); + if (prevoff == -1) { + brelse(bp, 0); + return (EJUSTRETURN); + } + } else + prevoff = offset; + *prevoffp = prevoff; + } + + /* Check for sequential access, and update offset. */ + if (dh->dh_seqopt == 0 && dh->dh_seqoff == offset) + dh->dh_seqopt = 1; + dh->dh_seqoff = offset + DIRSIZ(0, dp, needswap); + DIRHASH_UNLOCK(dh); + + *bpp = bp; + *offp = offset; + return (0); + } + + if (dh->dh_hash == NULL) { + DIRHASH_UNLOCK(dh); + if (bp != NULL) + brelse(bp, 0); + ufsdirhash_free(ip); + return (EJUSTRETURN); + } + /* + * When the name doesn't match in the seqopt case, go back + * and search normally. + */ + if (dh->dh_seqopt) { + dh->dh_seqopt = 0; + goto restart; + } + } + DIRHASH_UNLOCK(dh); + if (bp != NULL) + brelse(bp, 0); + return (ENOENT); +} + +/* + * Find a directory block with room for 'slotneeded' bytes. Returns + * the offset of the directory entry that begins the free space. + * This will either be the offset of an existing entry that has free + * space at the end, or the offset of an entry with d_ino == 0 at + * the start of a DIRBLKSIZ block. + * + * To use the space, the caller may need to compact existing entries in + * the directory. The total number of bytes in all of the entries involved + * in the compaction is stored in *slotsize. In other words, all of + * the entries that must be compacted are exactly contained in the + * region beginning at the returned offset and spanning *slotsize bytes. + * + * Returns -1 if no space was found, indicating that the directory + * must be extended. + */ +doff_t +ufsdirhash_findfree(struct inode *ip, int slotneeded, int *slotsize) +{ + struct direct *dp; + struct dirhash *dh; + struct buf *bp; + doff_t pos, slotstart; + int dirblock, error, freebytes, i; + const int needswap = UFS_MPNEEDSWAP(ip->i_ump); + int dirblksiz = ip->i_ump->um_dirblksiz; + + if ((dh = ip->i_dirhash) == NULL) + return (-1); + + DIRHASH_LOCK(dh); + if (dh->dh_hash == NULL) { + DIRHASH_UNLOCK(dh); + ufsdirhash_free(ip); + return (-1); + } + + /* Find a directory block with the desired free space. 
*/ + dirblock = -1; + for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++) + if ((dirblock = dh->dh_firstfree[i]) != -1) + break; + if (dirblock == -1) { + DIRHASH_UNLOCK(dh); + return (-1); + } + + KASSERT(dirblock < dh->dh_nblk && + dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN)); + pos = dirblock * dirblksiz; + error = ufs_blkatoff(ip->i_vnode, (off_t)pos, (void *)&dp, &bp, false); + if (error) { + DIRHASH_UNLOCK(dh); + return (-1); + } + /* Find the first entry with free space. */ + for (i = 0; i < dirblksiz; ) { + if (dp->d_reclen == 0) { + DIRHASH_UNLOCK(dh); + brelse(bp, 0); + return (-1); + } + if (dp->d_ino == 0 || dp->d_reclen > DIRSIZ(0, dp, needswap)) + break; + i += dp->d_reclen; + dp = (struct direct *)((char *)dp + dp->d_reclen); + } + if (i > dirblksiz) { + DIRHASH_UNLOCK(dh); + brelse(bp, 0); + return (-1); + } + slotstart = pos + i; + + /* Find the range of entries needed to get enough space */ + freebytes = 0; + while (i < dirblksiz && freebytes < slotneeded) { + freebytes += dp->d_reclen; + if (dp->d_ino != 0) + freebytes -= DIRSIZ(0, dp, needswap); + if (dp->d_reclen == 0) { + DIRHASH_UNLOCK(dh); + brelse(bp, 0); + return (-1); + } + i += dp->d_reclen; + dp = (struct direct *)((char *)dp + dp->d_reclen); + } + if (i > dirblksiz) { + DIRHASH_UNLOCK(dh); + brelse(bp, 0); + return (-1); + } + if (freebytes < slotneeded) + panic("ufsdirhash_findfree: free mismatch"); + DIRHASH_UNLOCK(dh); + brelse(bp, 0); + *slotsize = pos + i - slotstart; + return (slotstart); +} + +/* + * Return the start of the unused space at the end of a directory, or + * -1 if there are no trailing unused blocks. + */ +doff_t +ufsdirhash_enduseful(struct inode *ip) +{ + struct dirhash *dh; + int i; + int dirblksiz = ip->i_ump->um_dirblksiz; + + if ((dh = ip->i_dirhash) == NULL) + return (-1); + + DIRHASH_LOCK(dh); + if (dh->dh_hash == NULL) { + DIRHASH_UNLOCK(dh); + ufsdirhash_free(ip); + return (-1); + } + + if (dh->dh_blkfree[dh->dh_dirblks - 1] != dirblksiz / DIRALIGN) { + DIRHASH_UNLOCK(dh); + return (-1); + } + + for (i = dh->dh_dirblks - 1; i >= 0; i--) + if (dh->dh_blkfree[i] != dirblksiz / DIRALIGN) + break; + DIRHASH_UNLOCK(dh); + return ((doff_t)(i + 1) * dirblksiz); +} + +/* + * Insert information into the hash about a new directory entry. dirp + * points to a struct direct containing the entry, and offset specifies + * the offset of this entry. + */ +void +ufsdirhash_add(struct inode *ip, struct direct *dirp, doff_t offset) +{ + struct dirhash *dh; + int slot; + const int needswap = UFS_MPNEEDSWAP(ip->i_ump); + int dirblksiz = ip->i_ump->um_dirblksiz; + + if ((dh = ip->i_dirhash) == NULL) + return; + + DIRHASH_LOCK(dh); + if (dh->dh_hash == NULL) { + DIRHASH_UNLOCK(dh); + ufsdirhash_free(ip); + return; + } + + KASSERT(offset < dh->dh_dirblks * dirblksiz); + /* + * Normal hash usage is < 66%. If the usage gets too high then + * remove the hash entirely and let it be rebuilt later. + */ + if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) { + DIRHASH_UNLOCK(dh); + ufsdirhash_free(ip); + return; + } + + /* Find a free hash slot (empty or deleted), and add the entry. */ + slot = ufsdirhash_hash(dh, dirp->d_name, dirp->d_namlen); + while (DH_ENTRY(dh, slot) >= 0) + slot = WRAPINCR(slot, dh->dh_hlen); + if (DH_ENTRY(dh, slot) == DIRHASH_EMPTY) + dh->dh_hused++; + DH_ENTRY(dh, slot) = offset; + + /* Update the per-block summary info. 
*/ + ufsdirhash_adjfree(dh, offset, -DIRSIZ(0, dirp, needswap), dirblksiz); + DIRHASH_UNLOCK(dh); +} + +/* + * Remove the specified directory entry from the hash. The entry to remove + * is defined by the name in `dirp', which must exist at the specified + * `offset' within the directory. + */ +void +ufsdirhash_remove(struct inode *ip, struct direct *dirp, doff_t offset) +{ + struct dirhash *dh; + int slot; + const int needswap = UFS_MPNEEDSWAP(ip->i_ump); + int dirblksiz = ip->i_ump->um_dirblksiz; + + if ((dh = ip->i_dirhash) == NULL) + return; + + DIRHASH_LOCK(dh); + if (dh->dh_hash == NULL) { + DIRHASH_UNLOCK(dh); + ufsdirhash_free(ip); + return; + } + + KASSERT(offset < dh->dh_dirblks * dirblksiz); + /* Find the entry */ + slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, offset); + + /* Remove the hash entry. */ + ufsdirhash_delslot(dh, slot); + + /* Update the per-block summary info. */ + ufsdirhash_adjfree(dh, offset, DIRSIZ(0, dirp, needswap), dirblksiz); + DIRHASH_UNLOCK(dh); +} + +/* + * Change the offset associated with a directory entry in the hash. Used + * when compacting directory blocks. + */ +void +ufsdirhash_move(struct inode *ip, struct direct *dirp, doff_t oldoff, + doff_t newoff) +{ + struct dirhash *dh; + int slot; + + if ((dh = ip->i_dirhash) == NULL) + return; + DIRHASH_LOCK(dh); + if (dh->dh_hash == NULL) { + DIRHASH_UNLOCK(dh); + ufsdirhash_free(ip); + return; + } + + KASSERT(oldoff < dh->dh_dirblks * ip->i_ump->um_dirblksiz && + newoff < dh->dh_dirblks * ip->i_ump->um_dirblksiz); + /* Find the entry, and update the offset. */ + slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff); + DH_ENTRY(dh, slot) = newoff; + DIRHASH_UNLOCK(dh); +} + +/* + * Inform dirhash that the directory has grown by one block that + * begins at offset (i.e. the new length is offset + DIRBLKSIZ). + */ +void +ufsdirhash_newblk(struct inode *ip, doff_t offset) +{ + struct dirhash *dh; + int block; + int dirblksiz = ip->i_ump->um_dirblksiz; + + if ((dh = ip->i_dirhash) == NULL) + return; + DIRHASH_LOCK(dh); + if (dh->dh_hash == NULL) { + DIRHASH_UNLOCK(dh); + ufsdirhash_free(ip); + return; + } + + KASSERT(offset == dh->dh_dirblks * dirblksiz); + block = offset / dirblksiz; + if (block >= dh->dh_nblk) { + /* Out of space; must rebuild. */ + DIRHASH_UNLOCK(dh); + ufsdirhash_free(ip); + return; + } + dh->dh_dirblks = block + 1; + + /* Account for the new free block. */ + dh->dh_blkfree[block] = dirblksiz / DIRALIGN; + if (dh->dh_firstfree[DH_NFSTATS] == -1) + dh->dh_firstfree[DH_NFSTATS] = block; + DIRHASH_UNLOCK(dh); +} + +/* + * Inform dirhash that the directory is being truncated. + */ +void +ufsdirhash_dirtrunc(struct inode *ip, doff_t offset) +{ + struct dirhash *dh; + int block, i; + int dirblksiz = ip->i_ump->um_dirblksiz; + + if ((dh = ip->i_dirhash) == NULL) + return; + + DIRHASH_LOCK(dh); + if (dh->dh_hash == NULL) { + DIRHASH_UNLOCK(dh); + ufsdirhash_free(ip); + return; + } + + KASSERT(offset <= dh->dh_dirblks * dirblksiz); + block = howmany(offset, dirblksiz); + /* + * If the directory shrinks to less than 1/8 of dh_nblk blocks + * (about 20% of its original size due to the 50% extra added in + * ufsdirhash_build) then free it, and let the caller rebuild + * if necessary. + */ + if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) { + DIRHASH_UNLOCK(dh); + ufsdirhash_free(ip); + return; + } + + /* + * Remove any `first free' information pertaining to the + * truncated blocks. All blocks we're removing should be + * completely unused. 
+ */ + if (dh->dh_firstfree[DH_NFSTATS] >= block) + dh->dh_firstfree[DH_NFSTATS] = -1; + for (i = block; i < dh->dh_dirblks; i++) + if (dh->dh_blkfree[i] != dirblksiz / DIRALIGN) + panic("ufsdirhash_dirtrunc: blocks in use"); + for (i = 0; i < DH_NFSTATS; i++) + if (dh->dh_firstfree[i] >= block) + panic("ufsdirhash_dirtrunc: first free corrupt"); + dh->dh_dirblks = block; + DIRHASH_UNLOCK(dh); +} + +/* + * Debugging function to check that the dirhash information about + * a directory block matches its actual contents. Panics if a mismatch + * is detected. + * + * On entry, `sbuf' should point to the start of an in-core + * DIRBLKSIZ-sized directory block, and `offset' should contain the + * offset from the start of the directory of that block. + */ +void +ufsdirhash_checkblock(struct inode *ip, char *sbuf, doff_t offset) +{ + struct dirhash *dh; + struct direct *dp; + int block, ffslot, i, nfree; + const int needswap = UFS_MPNEEDSWAP(ip->i_ump); + int dirblksiz = ip->i_ump->um_dirblksiz; + + if (!ufs_dirhashcheck) + return; + if ((dh = ip->i_dirhash) == NULL) + return; + + DIRHASH_LOCK(dh); + if (dh->dh_hash == NULL) { + DIRHASH_UNLOCK(dh); + ufsdirhash_free(ip); + return; + } + + block = offset / dirblksiz; + if ((offset & (dirblksiz - 1)) != 0 || block >= dh->dh_dirblks) + panic("ufsdirhash_checkblock: bad offset"); + + nfree = 0; + for (i = 0; i < dirblksiz; i += dp->d_reclen) { + dp = (struct direct *)(sbuf + i); + if (dp->d_reclen == 0 || i + dp->d_reclen > dirblksiz) + panic("ufsdirhash_checkblock: bad dir"); + + if (dp->d_ino == 0) { +#if 0 + /* + * XXX entries with d_ino == 0 should only occur + * at the start of a DIRBLKSIZ block. However the + * ufs code is tolerant of such entries at other + * offsets, and fsck does not fix them. + */ + if (i != 0) + panic("ufsdirhash_checkblock: bad dir inode"); +#endif + nfree += dp->d_reclen; + continue; + } + + /* Check that the entry exists (will panic if it doesn't). */ + ufsdirhash_findslot(dh, dp->d_name, dp->d_namlen, offset + i); + + nfree += dp->d_reclen - DIRSIZ(0, dp, needswap); + } + if (i != dirblksiz) + panic("ufsdirhash_checkblock: bad dir end"); + + if (dh->dh_blkfree[block] * DIRALIGN != nfree) + panic("ufsdirhash_checkblock: bad free count"); + + ffslot = BLKFREE2IDX(nfree / DIRALIGN); + for (i = 0; i <= DH_NFSTATS; i++) + if (dh->dh_firstfree[i] == block && i != ffslot) + panic("ufsdirhash_checkblock: bad first-free"); + if (dh->dh_firstfree[ffslot] == -1) + panic("ufsdirhash_checkblock: missing first-free entry"); + DIRHASH_UNLOCK(dh); +} + +/* + * Hash the specified filename into a dirhash slot. + */ +static int +ufsdirhash_hash(struct dirhash *dh, const char *name, int namelen) +{ + u_int32_t hash; + + /* + * We hash the name and then some other bit of data that is + * invariant over the dirhash's lifetime. Otherwise names + * differing only in the last byte are placed close to one + * another in the table, which is bad for linear probing. + */ + hash = hash32_buf(name, namelen, HASH32_BUF_INIT); + hash = hash32_buf(&dh, sizeof(dh), hash); + return (hash % dh->dh_hlen); +} + +/* + * Adjust the number of free bytes in the block containing `offset' + * by the value specified by `diff'. + * + * The caller must ensure we have exclusive access to `dh'; normally + * that means that dh_lock should be held, but this is also called + * from ufsdirhash_build() where exclusive access can be assumed. 
+ */ +static void +ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff, int dirblksiz) +{ + int block, i, nfidx, ofidx; + + KASSERT(mutex_owned(&dh->dh_lock)); + + /* Update the per-block summary info. */ + block = offset / dirblksiz; + KASSERT(block < dh->dh_nblk && block < dh->dh_dirblks); + ofidx = BLKFREE2IDX(dh->dh_blkfree[block]); + dh->dh_blkfree[block] = (int)dh->dh_blkfree[block] + (diff / DIRALIGN); + nfidx = BLKFREE2IDX(dh->dh_blkfree[block]); + + /* Update the `first free' list if necessary. */ + if (ofidx != nfidx) { + /* If removing, scan forward for the next block. */ + if (dh->dh_firstfree[ofidx] == block) { + for (i = block + 1; i < dh->dh_dirblks; i++) + if (BLKFREE2IDX(dh->dh_blkfree[i]) == ofidx) + break; + dh->dh_firstfree[ofidx] = (i < dh->dh_dirblks) ? i : -1; + } + + /* Make this the new `first free' if necessary */ + if (dh->dh_firstfree[nfidx] > block || + dh->dh_firstfree[nfidx] == -1) + dh->dh_firstfree[nfidx] = block; + } +} + +/* + * Find the specified name which should have the specified offset. + * Returns a slot number, and panics on failure. + * + * `dh' must be locked on entry and remains so on return. + */ +static int +ufsdirhash_findslot(struct dirhash *dh, const char *name, int namelen, + doff_t offset) +{ + int slot; + + KASSERT(mutex_owned(&dh->dh_lock)); + + /* Find the entry. */ + KASSERT(dh->dh_hused < dh->dh_hlen); + slot = ufsdirhash_hash(dh, name, namelen); + while (DH_ENTRY(dh, slot) != offset && + DH_ENTRY(dh, slot) != DIRHASH_EMPTY) + slot = WRAPINCR(slot, dh->dh_hlen); + if (DH_ENTRY(dh, slot) != offset) + panic("ufsdirhash_findslot: '%.*s' not found", namelen, name); + + return (slot); +} + +/* + * Remove the entry corresponding to the specified slot from the hash array. + * + * `dh' must be locked on entry and remains so on return. + */ +static void +ufsdirhash_delslot(struct dirhash *dh, int slot) +{ + int i; + + KASSERT(mutex_owned(&dh->dh_lock)); + + /* Mark the entry as deleted. */ + DH_ENTRY(dh, slot) = DIRHASH_DEL; + + /* If this is the end of a chain of DIRHASH_DEL slots, remove them. */ + for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; ) + i = WRAPINCR(i, dh->dh_hlen); + if (DH_ENTRY(dh, i) == DIRHASH_EMPTY) { + i = WRAPDECR(i, dh->dh_hlen); + while (DH_ENTRY(dh, i) == DIRHASH_DEL) { + DH_ENTRY(dh, i) = DIRHASH_EMPTY; + dh->dh_hused--; + i = WRAPDECR(i, dh->dh_hlen); + } + KASSERT(dh->dh_hused >= 0); + } +} + +/* + * Given a directory entry and its offset, find the offset of the + * previous entry in the same DIRBLKSIZ-sized block. Returns an + * offset, or -1 if there is no previous entry in the block or some + * other problem occurred. + */ +static doff_t +ufsdirhash_getprev(struct direct *dirp, doff_t offset, int dirblksiz) +{ + struct direct *dp; + char *blkbuf; + doff_t blkoff, prevoff; + int entrypos, i; + + blkoff = offset & ~(dirblksiz - 1); /* offset of start of block */ + entrypos = offset & (dirblksiz - 1); /* entry relative to block */ + blkbuf = (char *)dirp - entrypos; + prevoff = blkoff; + + /* If `offset' is the start of a block, there is no previous entry. */ + if (entrypos == 0) + return (-1); + + /* Scan from the start of the block until we get to the entry. */ + for (i = 0; i < entrypos; i += dp->d_reclen) { + dp = (struct direct *)(blkbuf + i); + if (dp->d_reclen == 0 || i + dp->d_reclen > entrypos) + return (-1); /* Corrupted directory. */ + prevoff = blkoff + i; + } + return (prevoff); +} + +/* + * Try to free up `wanted' bytes by stealing memory from existing + * dirhashes. 
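/*
 * A stand-alone user-space sketch (not part of this change) of the
 * open-addressing scheme used by ufsdirhash_hash(), ufsdirhash_findslot()
 * and ufsdirhash_delslot() above: probe with a wrapping increment, leave a
 * tombstone (DIRHASH_DEL) on delete, and collapse a tombstone run back to
 * DIRHASH_EMPTY when it ends at an empty slot.  The table size, the toy
 * hash and all names are invented; like the kernel code (which keeps
 * dh_hused below dh_hlen) it assumes at least one slot stays empty.
 */
#include <stdio.h>

#define TOY_LEN		16
#define TOY_EMPTY	(-1)	/* like DIRHASH_EMPTY: slot never used */
#define TOY_DEL		(-2)	/* like DIRHASH_DEL: tombstone          */
#define TOY_INC(i)	(((i) + 1) % TOY_LEN)
#define TOY_DEC(i)	(((i) + TOY_LEN - 1) % TOY_LEN)

static int toy_table[TOY_LEN];

/* Stands in for hash32_buf(name) mixed with the dirhash pointer. */
static unsigned
toy_hash(const char *name)
{
	unsigned h = 2166136261u;

	while (*name)
		h = (h ^ (unsigned char)*name++) * 16777619u;
	return h % TOY_LEN;
}

/* Insert: skip live entries, reuse the first empty or deleted slot. */
static void
toy_add(const char *name, int offset)
{
	unsigned slot = toy_hash(name);

	while (toy_table[slot] >= 0)
		slot = TOY_INC(slot);
	toy_table[slot] = offset;
}

/* Delete: assumes the entry exists (the kernel panics if it does not). */
static void
toy_del(const char *name, int offset)
{
	unsigned i, slot = toy_hash(name);

	while (toy_table[slot] != offset && toy_table[slot] != TOY_EMPTY)
		slot = TOY_INC(slot);
	toy_table[slot] = TOY_DEL;

	/* If this tombstone run ends at an empty slot, collapse the run. */
	for (i = slot; toy_table[i] == TOY_DEL; )
		i = TOY_INC(i);
	if (toy_table[i] == TOY_EMPTY) {
		i = TOY_DEC(i);
		while (toy_table[i] == TOY_DEL) {
			toy_table[i] = TOY_EMPTY;
			i = TOY_DEC(i);
		}
	}
}

int
main(void)
{
	unsigned i;

	for (i = 0; i < TOY_LEN; i++)
		toy_table[i] = TOY_EMPTY;
	toy_add("abc", 0);		/* a first entry at offset 0   */
	toy_add("abd", 512);		/* a second entry              */
	toy_del("abc", 0);		/* remove the first again      */
	for (i = 0; i < TOY_LEN; i++)
		printf("%2u: %d\n", i, toy_table[i]);
	return 0;
}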
Returns zero with list locked if successful. + */ +static int +ufsdirhash_recycle(int wanted) +{ + struct dirhash *dh; + doff_t **hash; + u_int8_t *blkfree; + int i, mem, narrays; + size_t hashsz, blkfreesz; + + DIRHASHLIST_LOCK(); + while (wanted + ufs_dirhashmem > ufs_dirhashmaxmem) { + /* Find a dirhash, and lock it. */ + if ((dh = TAILQ_FIRST(&ufsdirhash_list)) == NULL) { + DIRHASHLIST_UNLOCK(); + return (-1); + } + DIRHASH_LOCK(dh); + KASSERT(dh->dh_hash != NULL); + + /* Decrement the score; only recycle if it becomes zero. */ + if (--dh->dh_score > 0) { + DIRHASH_UNLOCK(dh); + DIRHASHLIST_UNLOCK(); + return (-1); + } + + /* Remove it from the list and detach its memory. */ + TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); + dh->dh_onlist = 0; + hash = dh->dh_hash; + hashsz = dh->dh_hashsz; + dh->dh_hash = NULL; + blkfree = dh->dh_blkfree; + blkfreesz = dh->dh_blkfreesz; + dh->dh_blkfree = NULL; + narrays = dh->dh_narrays; + mem = narrays * sizeof(*dh->dh_hash) + + narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) + + dh->dh_nblk * sizeof(*dh->dh_blkfree); + + /* Unlock everything, free the detached memory. */ + DIRHASH_UNLOCK(dh); + DIRHASHLIST_UNLOCK(); + + for (i = 0; i < narrays; i++) + DIRHASH_BLKFREE(hash[i]); + kmem_free(hash, hashsz); + kmem_free(blkfree, blkfreesz); + + /* Account for the returned memory, and repeat if necessary. */ + DIRHASHLIST_LOCK(); + atomic_add_int(&ufs_dirhashmem, -mem); + } + /* Success. */ + return (0); +} + +static void +ufsdirhash_sysctl_init(void) +{ + const struct sysctlnode *rnode, *cnode; + + sysctl_createv(&ufsdirhash_sysctl_log, 0, NULL, &rnode, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "vfs", NULL, + NULL, 0, NULL, 0, + CTL_VFS, CTL_EOL); + + sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &rnode, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "ufs", + SYSCTL_DESCR("ufs"), + NULL, 0, NULL, 0, + CTL_CREATE, CTL_EOL); + + sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &rnode, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "dirhash", + SYSCTL_DESCR("dirhash"), + NULL, 0, NULL, 0, + CTL_CREATE, CTL_EOL); + + sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &cnode, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "minblocks", + SYSCTL_DESCR("minimum hashed directory size in blocks"), + NULL, 0, &ufs_dirhashminblks, 0, + CTL_CREATE, CTL_EOL); + + sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &cnode, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "maxmem", + SYSCTL_DESCR("maximum dirhash memory usage"), + NULL, 0, &ufs_dirhashmaxmem, 0, + CTL_CREATE, CTL_EOL); + + sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &cnode, + CTLFLAG_PERMANENT|CTLFLAG_READONLY, + CTLTYPE_INT, "memused", + SYSCTL_DESCR("current dirhash memory usage"), + NULL, 0, &ufs_dirhashmem, 0, + CTL_CREATE, CTL_EOL); + + sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &cnode, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "docheck", + SYSCTL_DESCR("enable extra sanity checks"), + NULL, 0, &ufs_dirhashcheck, 0, + CTL_CREATE, CTL_EOL); +} + +void +ufsdirhash_init(void) +{ + + mutex_init(&ufsdirhash_lock, MUTEX_DEFAULT, IPL_NONE); + ufsdirhashblk_cache = pool_cache_init(DH_NBLKOFF * sizeof(daddr_t), 0, + 0, 0, "dirhashblk", NULL, IPL_NONE, NULL, NULL, NULL); + ufsdirhash_cache = pool_cache_init(sizeof(struct dirhash), 0, + 0, 0, "dirhash", NULL, IPL_NONE, NULL, NULL, NULL); + TAILQ_INIT(&ufsdirhash_list); + ufsdirhash_sysctl_init(); +} + +void +ufsdirhash_done(void) +{ + + KASSERT(TAILQ_EMPTY(&ufsdirhash_list)); + pool_cache_destroy(ufsdirhashblk_cache); + 
pool_cache_destroy(ufsdirhash_cache); + mutex_destroy(&ufsdirhash_lock); + sysctl_teardown(&ufsdirhash_sysctl_log); +} diff --git a/sys/ufs/ufs/ufs_extattr.c b/sys/ufs/ufs/ufs_extattr.c new file mode 100644 index 000000000..8b456b858 --- /dev/null +++ b/sys/ufs/ufs/ufs_extattr.c @@ -0,0 +1,1551 @@ +/* $NetBSD: ufs_extattr.c,v 1.35 2011/07/07 14:56:45 manu Exp $ */ + +/*- + * Copyright (c) 1999-2002 Robert N. M. Watson + * Copyright (c) 2002-2003 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed by Robert Watson for the TrustedBSD Project. + * + * This software was developed for the FreeBSD Project in part by Network + * Associates Laboratories, the Security Research Division of Network + * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), + * as part of the DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +/* + * Support for file system extended attributes on the UFS1 file system. + * + * Extended attributes are defined in the form name=value, where name is + * a nul-terminated string in the style of a file name, and value is a + * binary blob of zero or more bytes. The UFS1 extended attribute service + * layers support for extended attributes onto a backing file, in the style + * of the quota implementation, meaning that it requires no underlying format + * changes to the file system. This design choice exchanges simplicity, + * usability, and easy deployment for performance. 
+ */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ufs_extattr.c,v 1.35 2011/07/07 14:56:45 manu Exp $"); + +#ifdef _KERNEL_OPT +#include "opt_ffs.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static MALLOC_JUSTDEFINE(M_UFS_EXTATTR, "ufs_extattr","ufs extended attribute"); + +int ufs_extattr_sync = 1; +int ufs_extattr_autocreate = 1024; + +static int ufs_extattr_valid_attrname(int attrnamespace, + const char *attrname); +static int ufs_extattr_enable_with_open(struct ufsmount *ump, + struct vnode *vp, int attrnamespace, const char *attrname, + struct lwp *l); +static int ufs_extattr_enable(struct ufsmount *ump, int attrnamespace, + const char *attrname, struct vnode *backing_vnode, + struct lwp *l); +static int ufs_extattr_disable(struct ufsmount *ump, int attrnamespace, + const char *attrname, struct lwp *l); +static int ufs_extattr_get(struct vnode *vp, int attrnamespace, + const char *name, struct uio *uio, size_t *size, + kauth_cred_t cred, struct lwp *l); +static int ufs_extattr_list(struct vnode *vp, int attrnamespace, + struct uio *uio, size_t *size, int flag, + kauth_cred_t cred, struct lwp *l); +static int ufs_extattr_set(struct vnode *vp, int attrnamespace, + const char *name, struct uio *uio, kauth_cred_t cred, + struct lwp *l); +static int ufs_extattr_rm(struct vnode *vp, int attrnamespace, + const char *name, kauth_cred_t cred, struct lwp *l); +static struct ufs_extattr_list_entry *ufs_extattr_find_attr(struct ufsmount *, + int, const char *); +static int ufs_extattr_get_header(struct vnode *, + struct ufs_extattr_list_entry *, + struct ufs_extattr_header *, off_t *); + +/* + * Per-FS attribute lock protecting attribute operations. + * XXX Right now there is a lot of lock contention due to having a single + * lock per-FS; really, this should be far more fine-grained. + */ +static void +ufs_extattr_uepm_lock(struct ufsmount *ump) +{ + + /* XXX Why does this need to be recursive? */ + if (mutex_owned(&ump->um_extattr.uepm_lock)) { + ump->um_extattr.uepm_lockcnt++; + return; + } + mutex_enter(&ump->um_extattr.uepm_lock); +} + +static void +ufs_extattr_uepm_unlock(struct ufsmount *ump) +{ + + if (ump->um_extattr.uepm_lockcnt != 0) { + KASSERT(mutex_owned(&ump->um_extattr.uepm_lock)); + ump->um_extattr.uepm_lockcnt--; + return; + } + mutex_exit(&ump->um_extattr.uepm_lock); +} + +/*- + * Determine whether the name passed is a valid name for an actual + * attribute. 
+ * + * Invalid currently consists of: + * NULL pointer for attrname + * zero-length attrname (used to retrieve application attribute list) + */ +static int +ufs_extattr_valid_attrname(int attrnamespace, const char *attrname) +{ + + if (attrname == NULL) + return (0); + if (strlen(attrname) == 0) + return (0); + return (1); +} + +/* + * Autocreate an attribute storage + */ +static struct ufs_extattr_list_entry * +ufs_extattr_autocreate_attr(struct vnode *vp, int attrnamespace, + const char *attrname, struct lwp *l) +{ + struct mount *mp = vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + struct vnode *backing_vp; + struct nameidata nd; + struct pathbuf *pb; + char *path; + struct ufs_extattr_fileheader uef; + struct ufs_extattr_list_entry *uele; + int error; + + path = PNBUF_GET(); + + /* + * We only support system and user namespace autocreation + */ + switch (attrnamespace) { + case EXTATTR_NAMESPACE_SYSTEM: + (void)snprintf(path, PATH_MAX, "%s/%s/%s/%s", + mp->mnt_stat.f_mntonname, + UFS_EXTATTR_FSROOTSUBDIR, + UFS_EXTATTR_SUBDIR_SYSTEM, + attrname); + break; + case EXTATTR_NAMESPACE_USER: + (void)snprintf(path, PATH_MAX, "%s/%s/%s/%s", + mp->mnt_stat.f_mntonname, + UFS_EXTATTR_FSROOTSUBDIR, + UFS_EXTATTR_SUBDIR_USER, + attrname); + break; + default: + PNBUF_PUT(path); + return NULL; + break; + } + + /* + * When setting attribute on the root vnode, we get it + * already locked, and vn_open/namei/VFS_ROOT will try to + * look it, causing a panic. Unlock it first. + */ + if (vp->v_vflag && VV_ROOT) { + KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); + VOP_UNLOCK(vp); + } + KASSERT(VOP_ISLOCKED(vp) == 0); + + pb = pathbuf_create(path); + NDINIT(&nd, CREATE, LOCKPARENT, pb); + + error = vn_open(&nd, O_CREAT|O_RDWR, 0600); + + /* + * Reacquire the lock on the vnode if it was root. + */ + KASSERT(VOP_ISLOCKED(vp) == 0); + if (vp->v_vflag && VV_ROOT) + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); + + if (error != 0) { + pathbuf_destroy(pb); + PNBUF_PUT(path); + return NULL; + } + + KASSERT(nd.ni_vp != NULL); + KASSERT(VOP_ISLOCKED(nd.ni_vp) == LK_EXCLUSIVE); + KASSERT(VOP_ISLOCKED(nd.ni_dvp) == 0); + + /* + * backing_vp is the backing store. + */ + backing_vp = nd.ni_vp; + pathbuf_destroy(pb); + PNBUF_PUT(path); + + uef.uef_magic = UFS_EXTATTR_MAGIC; + uef.uef_version = UFS_EXTATTR_VERSION; + uef.uef_size = ufs_extattr_autocreate; + + error = vn_rdwr(UIO_WRITE, backing_vp, &uef, sizeof(uef), 0, + UIO_SYSSPACE, IO_NODELOCKED|IO_APPEND, + l->l_cred, NULL, l); + + VOP_UNLOCK(backing_vp); + + if (error != 0) { + printf("%s: write uef header failed for %s, error = %d\n", + __func__, attrname, error); + vn_close(backing_vp, FREAD|FWRITE, l->l_cred); + return NULL; + } + + /* + * ufs_extattr_enable_with_open increases the vnode reference + * count. Not sure why, but do the same here. + */ + vref(vp); + + /* + * Now enable attribute. 
+ */ + error = ufs_extattr_enable(ump,attrnamespace, attrname, backing_vp, l); + KASSERT(VOP_ISLOCKED(backing_vp) == 0); + + if (error != 0) { + printf("%s: enable %s failed, error %d\n", + __func__, attrname, error); + vn_close(backing_vp, FREAD|FWRITE, l->l_cred); + return NULL; + } + + uele = ufs_extattr_find_attr(ump, attrnamespace, attrname); + if (uele == NULL) { + printf("%s: atttribute %s created but not found!\n", + __func__, attrname); + vn_close(backing_vp, FREAD|FWRITE, l->l_cred); + return NULL; + } + + printf("%s: EA backing store autocreated for %s\n", + mp->mnt_stat.f_mntonname, attrname); + + return uele; +} + +/* + * Locate an attribute given a name and mountpoint. + * Must be holding uepm lock for the mount point. + */ +static struct ufs_extattr_list_entry * +ufs_extattr_find_attr(struct ufsmount *ump, int attrnamespace, + const char *attrname) +{ + struct ufs_extattr_list_entry *search_attribute; + + for (search_attribute = LIST_FIRST(&ump->um_extattr.uepm_list); + search_attribute != NULL; + search_attribute = LIST_NEXT(search_attribute, uele_entries)) { + if (!(strncmp(attrname, search_attribute->uele_attrname, + UFS_EXTATTR_MAXEXTATTRNAME)) && + (attrnamespace == search_attribute->uele_attrnamespace)) { + return (search_attribute); + } + } + + return (0); +} + +/* + * Initialize per-FS structures supporting extended attributes. Do not + * start extended attributes yet. + */ +void +ufs_extattr_uepm_init(struct ufs_extattr_per_mount *uepm) +{ + + uepm->uepm_flags = 0; + uepm->uepm_lockcnt = 0; + + LIST_INIT(&uepm->uepm_list); + mutex_init(&uepm->uepm_lock, MUTEX_DEFAULT, IPL_NONE); + uepm->uepm_flags |= UFS_EXTATTR_UEPM_INITIALIZED; +} + +/* + * Destroy per-FS structures supporting extended attributes. Assumes + * that EAs have already been stopped, and will panic if not. + */ +void +ufs_extattr_uepm_destroy(struct ufs_extattr_per_mount *uepm) +{ + + if (!(uepm->uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) + panic("ufs_extattr_uepm_destroy: not initialized"); + + if ((uepm->uepm_flags & UFS_EXTATTR_UEPM_STARTED)) + panic("ufs_extattr_uepm_destroy: called while still started"); + + /* + * It's not clear that either order for the next two lines is + * ideal, and it should never be a problem if this is only called + * during unmount, and with vfs_busy(). + */ + uepm->uepm_flags &= ~UFS_EXTATTR_UEPM_INITIALIZED; + mutex_destroy(&uepm->uepm_lock); +} + +/* + * Start extended attribute support on an FS. + */ +int +ufs_extattr_start(struct mount *mp, struct lwp *l) +{ + struct ufsmount *ump; + int error = 0; + + ump = VFSTOUFS(mp); + + ufs_extattr_uepm_lock(ump); + + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) { + error = EOPNOTSUPP; + goto unlock; + } + if (ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED) { + error = EBUSY; + goto unlock; + } + + ump->um_extattr.uepm_flags |= UFS_EXTATTR_UEPM_STARTED; + + ump->um_extattr.uepm_ucred = l->l_cred; + kauth_cred_hold(ump->um_extattr.uepm_ucred); + + unlock: + ufs_extattr_uepm_unlock(ump); + + return (error); +} + +/* + * Helper routine: given a locked parent directory and filename, return + * the locked vnode of the inode associated with the name. Will not + * follow symlinks, may return any type of vnode. Lock on parent will + * be released even in the event of a failure. In the event that the + * target is the parent (i.e., "."), there will be two references and + * one lock, requiring the caller to possibly special-case. 
+ */ +static int +ufs_extattr_lookup(struct vnode *start_dvp, int lockparent, const char *dirname, + struct vnode **vp, struct lwp *l) +{ + struct vop_lookup_args vargs; + struct componentname cnp; + struct vnode *target_vp; + char *pnbuf; + int error; + + KASSERT(VOP_ISLOCKED(start_dvp) == LK_EXCLUSIVE); + + pnbuf = PNBUF_GET(); + + memset(&cnp, 0, sizeof(cnp)); + cnp.cn_nameiop = LOOKUP; + cnp.cn_flags = ISLASTCN | lockparent; + cnp.cn_cred = l->l_cred; + cnp.cn_nameptr = pnbuf; + error = copystr(dirname, pnbuf, MAXPATHLEN, &cnp.cn_namelen); + if (error) { + if (lockparent == 0) { + VOP_UNLOCK(start_dvp); + } + PNBUF_PUT(pnbuf); + printf("ufs_extattr_lookup: copystr failed\n"); + return (error); + } + cnp.cn_namelen--; /* trim nul termination */ + vargs.a_desc = NULL; + vargs.a_dvp = start_dvp; + vargs.a_vpp = &target_vp; + vargs.a_cnp = &cnp; + error = ufs_lookup(&vargs); + PNBUF_PUT(pnbuf); + if (error) { + if (lockparent == 0) { + VOP_UNLOCK(start_dvp); + } + return (error); + } +#if 0 + if (target_vp == start_dvp) + panic("ufs_extattr_lookup: target_vp == start_dvp"); +#endif + + if ((target_vp != start_dvp) && (lockparent == 0)) + VOP_UNLOCK(start_dvp); + + KASSERT(VOP_ISLOCKED(target_vp) == LK_EXCLUSIVE); + *vp = target_vp; + return (0); +} + +/* + * Enable an EA using the passed filesystem, backing vnode, attribute name, + * namespace, and proc. Will perform a VOP_OPEN() on the vp, so expects vp + * to be locked when passed in. The vnode will be returned unlocked, + * regardless of success/failure of the function. As a result, the caller + * will always need to vrele(), but not vput(). + */ +static int +ufs_extattr_enable_with_open(struct ufsmount *ump, struct vnode *vp, + int attrnamespace, const char *attrname, struct lwp *l) +{ + int error; + + error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred); + if (error) { + printf("ufs_extattr_enable_with_open.VOP_OPEN(): failed " + "with %d\n", error); + VOP_UNLOCK(vp); + return (error); + } + + mutex_enter(vp->v_interlock); + vp->v_writecount++; + mutex_exit(vp->v_interlock); + + vref(vp); + + VOP_UNLOCK(vp); + + error = ufs_extattr_enable(ump, attrnamespace, attrname, vp, l); + if (error != 0) + vn_close(vp, FREAD|FWRITE, l->l_cred); + return (error); +} + +/* + * Given a locked directory vnode, iterate over the names in the directory + * and use ufs_extattr_lookup() to retrieve locked vnodes of potential + * attribute files. Then invoke ufs_extattr_enable_with_open() on each + * to attempt to start the attribute. Leaves the directory locked on + * exit. 
+ */ +static int +ufs_extattr_iterate_directory(struct ufsmount *ump, struct vnode *dvp, + int attrnamespace, struct lwp *l) +{ + struct vop_readdir_args vargs; + struct statvfs *sbp = &ump->um_mountp->mnt_stat; + struct dirent *dp, *edp; + struct vnode *attr_vp; + struct uio auio; + struct iovec aiov; + char *dirbuf; + int error, eofflag = 0; + + if (dvp->v_type != VDIR) + return (ENOTDIR); + + dirbuf = malloc(DIRBLKSIZ, M_TEMP, M_WAITOK); + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_offset = 0; + UIO_SETUP_SYSSPACE(&auio); + + vargs.a_desc = NULL; + vargs.a_vp = dvp; + vargs.a_uio = &auio; + vargs.a_cred = l->l_cred; + vargs.a_eofflag = &eofflag; + vargs.a_ncookies = NULL; + vargs.a_cookies = NULL; + + while (!eofflag) { + auio.uio_resid = DIRBLKSIZ; + aiov.iov_base = dirbuf; + aiov.iov_len = DIRBLKSIZ; + error = ufs_readdir(&vargs); + if (error) { + printf("ufs_extattr_iterate_directory: ufs_readdir " + "%d\n", error); + return (error); + } + + /* + * XXXRW: While in UFS, we always get DIRBLKSIZ returns from + * the directory code on success, on other file systems this + * may not be the case. For portability, we should check the + * read length on return from ufs_readdir(). + */ + edp = (struct dirent *)&dirbuf[DIRBLKSIZ]; + for (dp = (struct dirent *)dirbuf; dp < edp; ) { + if (dp->d_reclen == 0) + break; + /* Skip "." and ".." */ + if (dp->d_name[0] == '.' && + (dp->d_name[1] == '\0' || + (dp->d_name[1] == '.' && dp->d_name[2] == '\0'))) + goto next; + error = ufs_extattr_lookup(dvp, LOCKPARENT, + dp->d_name, &attr_vp, l); + if (error == ENOENT) { + goto next; /* keep silent */ + } else if (error) { + printf("ufs_extattr_iterate_directory: lookup " + "%s %d\n", dp->d_name, error); + } else if (attr_vp == dvp) { + vrele(attr_vp); + } else if (attr_vp->v_type != VREG) { + vput(attr_vp); + } else { + error = ufs_extattr_enable_with_open(ump, + attr_vp, attrnamespace, dp->d_name, l); + vrele(attr_vp); + if (error) { + printf("ufs_extattr_iterate_directory: " + "enable %s %d\n", dp->d_name, + error); + } else if (bootverbose) { + printf("%s: EA %s loaded\n", + sbp->f_mntonname, dp->d_name); + } + } + next: + dp = (struct dirent *) ((char *)dp + dp->d_reclen); + if (dp >= edp) + break; + } + } + free(dirbuf, M_TEMP); + + return (0); +} + +/* + * Auto-start of extended attributes, to be executed (optionally) at + * mount-time. + */ +int +ufs_extattr_autostart(struct mount *mp, struct lwp *l) +{ + struct vnode *rvp, *attr_dvp, *attr_system_dvp, *attr_user_dvp; + int error; + + /* + * Does UFS_EXTATTR_FSROOTSUBDIR exist off the filesystem root? + * If so, automatically start EA's. + */ + error = VFS_ROOT(mp, &rvp); + if (error) { + printf("ufs_extattr_autostart.VFS_ROOT() returned %d\n", + error); + return (error); + } + + KASSERT(VOP_ISLOCKED(rvp) == LK_EXCLUSIVE); + + error = ufs_extattr_lookup(rvp, 0, + UFS_EXTATTR_FSROOTSUBDIR, &attr_dvp, l); + if (error) { + /* rvp ref'd but now unlocked */ + KASSERT(VOP_ISLOCKED(rvp) == 0); + vrele(rvp); + return (error); + } + if (rvp == attr_dvp) { + /* Should never happen. 
*/ + KASSERT(VOP_ISLOCKED(rvp) == LK_EXCLUSIVE); + vrele(attr_dvp); + vput(rvp); + return (EINVAL); + } + KASSERT(VOP_ISLOCKED(rvp) == 0); + vrele(rvp); + + KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE); + + if (attr_dvp->v_type != VDIR) { + printf("ufs_extattr_autostart: %s != VDIR\n", + UFS_EXTATTR_FSROOTSUBDIR); + goto return_vput_attr_dvp; + } + + error = ufs_extattr_start(mp, l); + if (error) { + printf("ufs_extattr_autostart: ufs_extattr_start failed (%d)\n", + error); + goto return_vput_attr_dvp; + } + + /* + * Look for two subdirectories: UFS_EXTATTR_SUBDIR_SYSTEM, + * UFS_EXTATTR_SUBDIR_USER. For each, iterate over the sub-directory, + * and start with appropriate type. Failures in either don't + * result in an over-all failure. attr_dvp is left locked to + * be cleaned up on exit. + */ + error = ufs_extattr_lookup(attr_dvp, LOCKPARENT, + UFS_EXTATTR_SUBDIR_SYSTEM, &attr_system_dvp, l); + KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE); + if (error == 0) { + KASSERT(VOP_ISLOCKED(attr_system_dvp) == LK_EXCLUSIVE); + error = ufs_extattr_iterate_directory(VFSTOUFS(mp), + attr_system_dvp, EXTATTR_NAMESPACE_SYSTEM, l); + if (error) + printf("ufs_extattr_iterate_directory returned %d\n", + error); + KASSERT(VOP_ISLOCKED(attr_system_dvp) == LK_EXCLUSIVE); + vput(attr_system_dvp); + } + + error = ufs_extattr_lookup(attr_dvp, LOCKPARENT, + UFS_EXTATTR_SUBDIR_USER, &attr_user_dvp, l); + KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE); + if (error == 0) { + KASSERT(VOP_ISLOCKED(attr_user_dvp) == LK_EXCLUSIVE); + error = ufs_extattr_iterate_directory(VFSTOUFS(mp), + attr_user_dvp, EXTATTR_NAMESPACE_USER, l); + if (error) + printf("ufs_extattr_iterate_directory returned %d\n", + error); + KASSERT(VOP_ISLOCKED(attr_user_dvp) == LK_EXCLUSIVE); + vput(attr_user_dvp); + } + + /* Mask startup failures in sub-directories. */ + error = 0; + + return_vput_attr_dvp: + KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE); + vput(attr_dvp); + + return (error); +} + +/* + * Stop extended attribute support on an FS. + */ +void +ufs_extattr_stop(struct mount *mp, struct lwp *l) +{ + struct ufs_extattr_list_entry *uele; + struct ufsmount *ump = VFSTOUFS(mp); + + ufs_extattr_uepm_lock(ump); + + /* + * If we haven't been started, no big deal. Just short-circuit + * the processing work. + */ + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) { + goto unlock; + } + + while (LIST_FIRST(&ump->um_extattr.uepm_list) != NULL) { + uele = LIST_FIRST(&ump->um_extattr.uepm_list); + ufs_extattr_disable(ump, uele->uele_attrnamespace, + uele->uele_attrname, l); + } + + ump->um_extattr.uepm_flags &= ~UFS_EXTATTR_UEPM_STARTED; + + kauth_cred_free(ump->um_extattr.uepm_ucred); + ump->um_extattr.uepm_ucred = NULL; + + unlock: + ufs_extattr_uepm_unlock(ump); +} + +/* + * Enable a named attribute on the specified filesystem; provide an + * unlocked backing vnode to hold the attribute data. 
+ */ +static int +ufs_extattr_enable(struct ufsmount *ump, int attrnamespace, + const char *attrname, struct vnode *backing_vnode, struct lwp *l) +{ + struct ufs_extattr_list_entry *attribute; + struct iovec aiov; + struct uio auio; + int error = 0; + + if (!ufs_extattr_valid_attrname(attrnamespace, attrname)) + return (EINVAL); + if (backing_vnode->v_type != VREG) + return (EINVAL); + + attribute = malloc(sizeof(*attribute), M_UFS_EXTATTR, + M_WAITOK | M_ZERO); + + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) { + error = EOPNOTSUPP; + goto free_exit; + } + + if (ufs_extattr_find_attr(ump, attrnamespace, attrname)) { + error = EEXIST; + goto free_exit; + } + + strncpy(attribute->uele_attrname, attrname, + UFS_EXTATTR_MAXEXTATTRNAME); + attribute->uele_attrnamespace = attrnamespace; + memset(&attribute->uele_fileheader, 0, + sizeof(struct ufs_extattr_fileheader)); + + attribute->uele_backing_vnode = backing_vnode; + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = (void *) &attribute->uele_fileheader; + aiov.iov_len = sizeof(struct ufs_extattr_fileheader); + auio.uio_resid = sizeof(struct ufs_extattr_fileheader); + auio.uio_offset = (off_t) 0; + auio.uio_rw = UIO_READ; + UIO_SETUP_SYSSPACE(&auio); + + vn_lock(backing_vnode, LK_SHARED | LK_RETRY); + error = VOP_READ(backing_vnode, &auio, IO_NODELOCKED, + ump->um_extattr.uepm_ucred); + + if (error) + goto unlock_free_exit; + + if (auio.uio_resid != 0) { + printf("ufs_extattr_enable: malformed attribute header\n"); + error = EINVAL; + goto unlock_free_exit; + } + + /* + * Try to determine the byte order of the attribute file. + */ + if (attribute->uele_fileheader.uef_magic != UFS_EXTATTR_MAGIC) { + attribute->uele_flags |= UELE_F_NEEDSWAP; + attribute->uele_fileheader.uef_magic = + ufs_rw32(attribute->uele_fileheader.uef_magic, + UELE_NEEDSWAP(attribute)); + if (attribute->uele_fileheader.uef_magic != UFS_EXTATTR_MAGIC) { + printf("ufs_extattr_enable: invalid attribute header " + "magic\n"); + error = EINVAL; + goto unlock_free_exit; + } + } + attribute->uele_fileheader.uef_version = + ufs_rw32(attribute->uele_fileheader.uef_version, + UELE_NEEDSWAP(attribute)); + attribute->uele_fileheader.uef_size = + ufs_rw32(attribute->uele_fileheader.uef_size, + UELE_NEEDSWAP(attribute)); + + if (attribute->uele_fileheader.uef_version != UFS_EXTATTR_VERSION) { + printf("ufs_extattr_enable: incorrect attribute header " + "version\n"); + error = EINVAL; + goto unlock_free_exit; + } + + LIST_INSERT_HEAD(&ump->um_extattr.uepm_list, attribute, + uele_entries); + + VOP_UNLOCK(backing_vnode); + return (0); + + unlock_free_exit: + VOP_UNLOCK(backing_vnode); + + free_exit: + free(attribute, M_UFS_EXTATTR); + return (error); +} + +/* + * Disable extended attribute support on an FS. + */ +static int +ufs_extattr_disable(struct ufsmount *ump, int attrnamespace, + const char *attrname, struct lwp *l) +{ + struct ufs_extattr_list_entry *uele; + int error = 0; + + if (!ufs_extattr_valid_attrname(attrnamespace, attrname)) + return (EINVAL); + + uele = ufs_extattr_find_attr(ump, attrnamespace, attrname); + if (!uele) + return (ENOATTR); + + LIST_REMOVE(uele, uele_entries); + + error = vn_close(uele->uele_backing_vnode, FREAD|FWRITE, + l->l_cred); + + free(uele, M_UFS_EXTATTR); + + return (error); +} + +/* + * VFS call to manage extended attributes in UFS. If filename_vp is + * non-NULL, it must be passed in locked, and regardless of errors in + * processing, will be unlocked. 
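/*
 * A minimal stand-alone illustration (not part of this change) of the
 * byte-order probe ufs_extattr_enable() performs on the backing file
 * header: if the stored magic does not match in native order, the fields
 * are byte-swapped once (the role ufs_rw32()/UELE_NEEDSWAP play above) and
 * the magic is checked again.  The magic value and names are placeholders.
 */
#include <stdint.h>

#define TOY_EA_MAGIC	0x51ac0de5u	/* placeholder magic value */

static uint32_t
toy_bswap32(uint32_t v)
{
	return (v >> 24) | ((v >> 8) & 0x0000ff00u) |
	    ((v << 8) & 0x00ff0000u) | (v << 24);
}

/*
 * Decide whether a header written on either endianness is usable.
 * Returns 0 and sets *needswap accordingly, or -1 if the magic is wrong
 * in both byte orders.
 */
static int
toy_check_magic(uint32_t stored_magic, int *needswap)
{
	if (stored_magic == TOY_EA_MAGIC) {
		*needswap = 0;
		return 0;
	}
	if (toy_bswap32(stored_magic) == TOY_EA_MAGIC) {
		*needswap = 1;
		return 0;
	}
	return -1;
}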
+ */ +int +ufs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp, + int attrnamespace, const char *attrname) +{ + struct lwp *l = curlwp; + struct ufsmount *ump = VFSTOUFS(mp); + int error; + + /* + * Only privileged processes can configure extended attributes. + */ + if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, + NULL)) != 0) { + if (filename_vp != NULL) + VOP_UNLOCK(filename_vp); + return (error); + } + + switch(cmd) { + case UFS_EXTATTR_CMD_START: + if (filename_vp != NULL) { + VOP_UNLOCK(filename_vp); + return (EINVAL); + } + if (attrname != NULL) + return (EINVAL); + + error = ufs_extattr_autostart(mp, l); + return (error); + + case UFS_EXTATTR_CMD_STOP: + if (filename_vp != NULL) { + VOP_UNLOCK(filename_vp); + return (EINVAL); + } + if (attrname != NULL) + return (EINVAL); + + ufs_extattr_stop(mp, l); + return (0); + + case UFS_EXTATTR_CMD_ENABLE: + if (filename_vp == NULL) + return (EINVAL); + if (attrname == NULL) { + VOP_UNLOCK(filename_vp); + return (EINVAL); + } + + /* + * ufs_extattr_enable_with_open() will always unlock the + * vnode, regardless of failure. + */ + ufs_extattr_uepm_lock(ump); + error = ufs_extattr_enable_with_open(ump, filename_vp, + attrnamespace, attrname, l); + ufs_extattr_uepm_unlock(ump); + return (error); + + case UFS_EXTATTR_CMD_DISABLE: + if (filename_vp != NULL) { + VOP_UNLOCK(filename_vp); + return (EINVAL); + } + if (attrname == NULL) + return (EINVAL); + + ufs_extattr_uepm_lock(ump); + error = ufs_extattr_disable(ump, attrnamespace, attrname, l); + ufs_extattr_uepm_unlock(ump); + return (error); + + default: + return (EINVAL); + } +} + +/* + * Read extended attribute header for a given vnode and attribute. + * Backing vnode should be locked and unlocked by caller. + */ +static int +ufs_extattr_get_header(struct vnode *vp, struct ufs_extattr_list_entry *uele, + struct ufs_extattr_header *ueh, off_t *bap) +{ + struct mount *mp = vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + struct inode *ip = VTOI(vp); + off_t base_offset; + struct iovec aiov; + struct uio aio; + int error; + + /* + * Find base offset of header in file based on file header size, and + * data header size + maximum data size, indexed by inode number. + */ + base_offset = sizeof(struct ufs_extattr_fileheader) + + ip->i_number * (sizeof(struct ufs_extattr_header) + + uele->uele_fileheader.uef_size); + + /* + * Read in the data header to see if the data is defined, and if so + * how much. + */ + memset(ueh, 0, sizeof(struct ufs_extattr_header)); + aiov.iov_base = ueh; + aiov.iov_len = sizeof(struct ufs_extattr_header); + aio.uio_iov = &aiov; + aio.uio_iovcnt = 1; + aio.uio_rw = UIO_READ; + aio.uio_offset = base_offset; + aio.uio_resid = sizeof(struct ufs_extattr_header); + UIO_SETUP_SYSSPACE(&aio); + + error = VOP_READ(uele->uele_backing_vnode, &aio, + IO_NODELOCKED, ump->um_extattr.uepm_ucred); + if (error) + return error; + + /* + * Attribute headers are kept in file system byte order. + * XXX What about the blob of data? + */ + ueh->ueh_flags = ufs_rw32(ueh->ueh_flags, UELE_NEEDSWAP(uele)); + ueh->ueh_len = ufs_rw32(ueh->ueh_len, UELE_NEEDSWAP(uele)); + ueh->ueh_i_gen = ufs_rw32(ueh->ueh_i_gen, UELE_NEEDSWAP(uele)); + + /* Defined? */ + if ((ueh->ueh_flags & UFS_EXTATTR_ATTR_FLAG_INUSE) == 0) + return ENOATTR; + + /* Valid for the current inode generation? */ + if (ueh->ueh_i_gen != ip->i_gen) { + /* + * The inode itself has a different generation number + * than the uele data. 
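/*
 * ufs_extattr_get_header() above locates an inode's record by pure
 * arithmetic: the backing file holds one file header followed by a
 * fixed-size (header + data) slot per inode number, the data size being
 * uef_size from the file header.  A stand-alone sketch of that offset
 * computation (not part of this change; the struct shapes below are
 * placeholders, not the real ufs_extattr_* definitions):
 */
#include <sys/types.h>
#include <stdio.h>
#include <stdint.h>

struct toy_fileheader {			/* written once at offset 0 */
	uint32_t magic;
	uint32_t version;
	uint32_t size;			/* per-attribute data area, bytes */
};

struct toy_header {			/* one per inode, precedes its data */
	uint32_t flags;
	uint32_t len;
	uint32_t i_gen;
};

/* Byte offset of inode 'ino's record header within the backing file. */
static off_t
toy_record_offset(const struct toy_fileheader *fh, ino_t ino)
{
	return (off_t)sizeof(*fh) +
	    (off_t)ino * (sizeof(struct toy_header) + fh->size);
}

int
main(void)
{
	struct toy_fileheader fh = { 0, 1, 1024 };

	/* With a 1024-byte data area, inode 7's header starts here. */
	printf("%jd\n", (intmax_t)toy_record_offset(&fh, 7));
	return 0;
}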
For now, the best solution + * is to coerce this to undefined, and let it get cleaned + * up by the next write or extattrctl clean. + */ + printf("%s (%s): inode gen inconsistency (%u, %jd)\n", + __func__, mp->mnt_stat.f_mntonname, ueh->ueh_i_gen, + (intmax_t)ip->i_gen); + return ENOATTR; + } + + /* Local size consistency check. */ + if (ueh->ueh_len > uele->uele_fileheader.uef_size) + return ENXIO; + + /* Return base offset */ + if (bap != NULL) + *bap = base_offset; + + return 0; +} + +/* + * Vnode operation to retrieve a named extended attribute. + */ +int +ufs_getextattr(struct vop_getextattr_args *ap) +/* +vop_getextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + INOUT struct uio *a_uio; + OUT size_t *a_size; + IN kauth_cred_t a_cred; +}; +*/ +{ + struct mount *mp = ap->a_vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + int error; + + ufs_extattr_uepm_lock(ump); + + error = ufs_extattr_get(ap->a_vp, ap->a_attrnamespace, ap->a_name, + ap->a_uio, ap->a_size, ap->a_cred, curlwp); + + ufs_extattr_uepm_unlock(ump); + + return (error); +} + +/* + * Real work associated with retrieving a named attribute--assumes that + * the attribute lock has already been grabbed. + */ +static int +ufs_extattr_get(struct vnode *vp, int attrnamespace, const char *name, + struct uio *uio, size_t *size, kauth_cred_t cred, struct lwp *l) +{ + struct ufs_extattr_list_entry *attribute; + struct ufs_extattr_header ueh; + struct mount *mp = vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + off_t base_offset; + size_t len, old_len; + int error = 0; + + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) + return (EOPNOTSUPP); + + if (strlen(name) == 0) + return (EINVAL); + + error = extattr_check_cred(vp, attrnamespace, cred, l, IREAD); + if (error) + return (error); + + attribute = ufs_extattr_find_attr(ump, attrnamespace, name); + if (!attribute) + return (ENOATTR); + + /* + * Allow only offsets of zero to encourage the read/replace + * extended attribute semantic. Otherwise we can't guarantee + * atomicity, as we don't provide locks for extended attributes. + */ + if (uio != NULL && uio->uio_offset != 0) + return (ENXIO); + + /* + * Don't need to get a lock on the backing file if the getattr is + * being applied to the backing file, as the lock is already held. + */ + if (attribute->uele_backing_vnode != vp) + vn_lock(attribute->uele_backing_vnode, LK_SHARED | LK_RETRY); + + error = ufs_extattr_get_header(vp, attribute, &ueh, &base_offset); + if (error) + goto vopunlock_exit; + + /* Return full data size if caller requested it. */ + if (size != NULL) + *size = ueh.ueh_len; + + /* Return data if the caller requested it. */ + if (uio != NULL) { + /* Allow for offset into the attribute data. */ + uio->uio_offset = base_offset + sizeof(struct + ufs_extattr_header); + + /* + * Figure out maximum to transfer -- use buffer size and + * local data limit. 
+ */ + len = MIN(uio->uio_resid, ueh.ueh_len); + old_len = uio->uio_resid; + uio->uio_resid = len; + + error = VOP_READ(attribute->uele_backing_vnode, uio, + IO_NODELOCKED, ump->um_extattr.uepm_ucred); + if (error) + goto vopunlock_exit; + + uio->uio_resid = old_len - (len - uio->uio_resid); + } + + vopunlock_exit: + + if (uio != NULL) + uio->uio_offset = 0; + + if (attribute->uele_backing_vnode != vp) + VOP_UNLOCK(attribute->uele_backing_vnode); + + return (error); +} + +/* + * Vnode operation to list extended attribute for a vnode + */ +int +ufs_listextattr(struct vop_listextattr_args *ap) +/* +vop_listextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + INOUT struct uio *a_uio; + OUT size_t *a_size; + IN int flag; + IN kauth_cred_t a_cred; + struct proc *a_p; +}; +*/ +{ + struct mount *mp = ap->a_vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + int error; + + ufs_extattr_uepm_lock(ump); + + error = ufs_extattr_list(ap->a_vp, ap->a_attrnamespace, + ap->a_uio, ap->a_size, ap->a_flag, ap->a_cred, curlwp); + + ufs_extattr_uepm_unlock(ump); + + return (error); +} + +/* + * Real work associated with retrieving list of attributes--assumes that + * the attribute lock has already been grabbed. + */ +static int +ufs_extattr_list(struct vnode *vp, int attrnamespace, + struct uio *uio, size_t *size, int flag, + kauth_cred_t cred, struct lwp *l) +{ + struct ufs_extattr_list_entry *uele; + struct ufs_extattr_header ueh; + struct mount *mp = vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + size_t listsize = 0; + int error = 0; + + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) + return (EOPNOTSUPP); + + error = extattr_check_cred(vp, attrnamespace, cred, l, IREAD); + if (error) + return (error); + + LIST_FOREACH(uele, &ump->um_extattr.uepm_list, uele_entries) { + unsigned char attrnamelen; + + if (uele->uele_attrnamespace != attrnamespace) + continue; + + error = ufs_extattr_get_header(vp, uele, &ueh, NULL); + if (error == ENOATTR) + continue; + if (error != 0) + return error; + + /* + * Don't need to get a lock on the backing file if + * the listattr is being applied to the backing file, + * as the lock is already held. + */ + if (uele->uele_backing_vnode != vp) + vn_lock(uele->uele_backing_vnode, LK_SHARED | LK_RETRY); + + /* + * +1 for trailing NUL (listxattr flavor) + * or leading name length (extattr_list_file flavor) + */ + attrnamelen = strlen(uele->uele_attrname); + listsize += attrnamelen + 1; + + /* Return data if the caller requested it. */ + if (uio != NULL) { + /* + * We support two flavors. Either NUL-terminated + * strings (a la listxattr), or non NUL-terminated, + * one byte length prefixed strings (for + * extattr_list_file). EXTATTR_LIST_LENPREFIX switches + * that second behavior. + */ + if (flag & EXTATTR_LIST_LENPREFIX) { + uint8_t len = (uint8_t)attrnamelen; + + /* Copy leading name length */ + error = uiomove(&len, sizeof(len), uio); + if (error != 0) + break; + } else { + /* Include trailing NULL */ + attrnamelen++; + } + + error = uiomove(uele->uele_attrname, + (size_t)attrnamelen, uio); + if (error != 0) + break; + } + + if (uele->uele_backing_vnode != vp) + VOP_UNLOCK(uele->uele_backing_vnode); + + if (error != 0) + return error; + } + + if (uio != NULL) + uio->uio_offset = 0; + + /* Return full data size if caller requested it. */ + if (size != NULL) + *size = listsize; + + return 0; +} + +/* + * Vnode operation to remove a named attribute. 
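/*
 * Worked example for the two list encodings handled above (attribute
 * names chosen purely for illustration).  For user attributes "md5" and
 * "tags":
 *
 *   default (listxattr style, NUL-terminated):
 *     'm' 'd' '5' '\0' 't' 'a' 'g' 's' '\0'           -> 9 bytes
 *   EXTATTR_LIST_LENPREFIX (extattr_list_file style):
 *     0x03 'm' 'd' '5' 0x04 't' 'a' 'g' 's'           -> 9 bytes
 *
 * Either way each name contributes strlen(name) + 1 bytes, which is what
 * the listsize accounting in ufs_extattr_list() reports via *size.
 */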
+ */ +int +ufs_deleteextattr(struct vop_deleteextattr_args *ap) +/* +vop_deleteextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + IN kauth_cred_t a_cred; +}; +*/ +{ + struct mount *mp = ap->a_vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + int error; + + ufs_extattr_uepm_lock(ump); + + error = ufs_extattr_rm(ap->a_vp, ap->a_attrnamespace, ap->a_name, + ap->a_cred, curlwp); + + ufs_extattr_uepm_unlock(ump); + + return (error); +} + +/* + * Vnode operation to set a named attribute. + */ +int +ufs_setextattr(struct vop_setextattr_args *ap) +/* +vop_setextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + INOUT struct uio *a_uio; + IN kauth_cred_t a_cred; +}; +*/ +{ + struct mount *mp = ap->a_vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + int error; + + ufs_extattr_uepm_lock(ump); + + /* + * XXX: No longer a supported way to delete extended attributes. + */ + if (ap->a_uio == NULL) { + ufs_extattr_uepm_unlock(ump); + return (EINVAL); + } + + error = ufs_extattr_set(ap->a_vp, ap->a_attrnamespace, ap->a_name, + ap->a_uio, ap->a_cred, curlwp); + + ufs_extattr_uepm_unlock(ump); + + return (error); +} + +/* + * Real work associated with setting a vnode's extended attributes; + * assumes that the attribute lock has already been grabbed. + */ +static int +ufs_extattr_set(struct vnode *vp, int attrnamespace, const char *name, + struct uio *uio, kauth_cred_t cred, struct lwp *l) +{ + struct ufs_extattr_list_entry *attribute; + struct ufs_extattr_header ueh; + struct iovec local_aiov; + struct uio local_aio; + struct mount *mp = vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + struct inode *ip = VTOI(vp); + off_t base_offset; + int error = 0, ioflag; + + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) + return (EOPNOTSUPP); + if (!ufs_extattr_valid_attrname(attrnamespace, name)) + return (EINVAL); + + error = extattr_check_cred(vp, attrnamespace, cred, l, IWRITE); + if (error) + return (error); + + attribute = ufs_extattr_find_attr(ump, attrnamespace, name); + if (!attribute) { + attribute = ufs_extattr_autocreate_attr(vp, attrnamespace, + name, l); + if (!attribute) + return (ENOATTR); + } + + /* + * Early rejection of invalid offsets/length. + * Reject: any offset but 0 (replace) + * Any size greater than attribute size limit + */ + if (uio->uio_offset != 0 || + uio->uio_resid > attribute->uele_fileheader.uef_size) + return (ENXIO); + + /* + * Find base offset of header in file based on file header size, and + * data header size + maximum data size, indexed by inode number. + */ + base_offset = sizeof(struct ufs_extattr_fileheader) + + ip->i_number * (sizeof(struct ufs_extattr_header) + + attribute->uele_fileheader.uef_size); + + /* + * Write out a data header for the data. 
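+ * The header records the attribute length, the in-use flag and the
+ * inode generation number (byte-swapped as needed) at base_offset,
+ * ahead of the attribute data itself.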
+ */ + ueh.ueh_len = ufs_rw32((uint32_t) uio->uio_resid, + UELE_NEEDSWAP(attribute)); + ueh.ueh_flags = ufs_rw32(UFS_EXTATTR_ATTR_FLAG_INUSE, + UELE_NEEDSWAP(attribute)); + ueh.ueh_i_gen = ufs_rw32(ip->i_gen, UELE_NEEDSWAP(attribute)); + local_aiov.iov_base = &ueh; + local_aiov.iov_len = sizeof(struct ufs_extattr_header); + local_aio.uio_iov = &local_aiov; + local_aio.uio_iovcnt = 1; + local_aio.uio_rw = UIO_WRITE; + local_aio.uio_offset = base_offset; + local_aio.uio_resid = sizeof(struct ufs_extattr_header); + UIO_SETUP_SYSSPACE(&local_aio); + + /* + * Don't need to get a lock on the backing file if the setattr is + * being applied to the backing file, as the lock is already held. + */ + if (attribute->uele_backing_vnode != vp) + vn_lock(attribute->uele_backing_vnode, + LK_EXCLUSIVE | LK_RETRY); + + ioflag = IO_NODELOCKED; + if (ufs_extattr_sync) + ioflag |= IO_SYNC; + error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag, + ump->um_extattr.uepm_ucred); + if (error) + goto vopunlock_exit; + + if (local_aio.uio_resid != 0) { + error = ENXIO; + goto vopunlock_exit; + } + + /* + * Write out user data. + * XXX NOT ATOMIC WITH RESPECT TO THE HEADER. + */ + uio->uio_offset = base_offset + sizeof(struct ufs_extattr_header); + + ioflag = IO_NODELOCKED; + if (ufs_extattr_sync) + ioflag |= IO_SYNC; + error = VOP_WRITE(attribute->uele_backing_vnode, uio, ioflag, + ump->um_extattr.uepm_ucred); + + vopunlock_exit: + uio->uio_offset = 0; + + if (attribute->uele_backing_vnode != vp) + VOP_UNLOCK(attribute->uele_backing_vnode); + + return (error); +} + +/* + * Real work associated with removing an extended attribute from a vnode. + * Assumes the attribute lock has already been grabbed. + */ +static int +ufs_extattr_rm(struct vnode *vp, int attrnamespace, const char *name, + kauth_cred_t cred, struct lwp *l) +{ + struct ufs_extattr_list_entry *attribute; + struct ufs_extattr_header ueh; + struct mount *mp = vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + struct iovec local_aiov; + struct uio local_aio; + off_t base_offset; + int error = 0, ioflag; + + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) + return (EOPNOTSUPP); + if (!ufs_extattr_valid_attrname(attrnamespace, name)) + return (EINVAL); + + error = extattr_check_cred(vp, attrnamespace, cred, l, IWRITE); + if (error) + return (error); + + attribute = ufs_extattr_find_attr(ump, attrnamespace, name); + if (!attribute) + return (ENOATTR); + + /* + * Don't need to get a lock on the backing file if the getattr is + * being applied to the backing file, as the lock is already held. + */ + if (attribute->uele_backing_vnode != vp) + vn_lock(attribute->uele_backing_vnode, LK_EXCLUSIVE | LK_RETRY); + + error = ufs_extattr_get_header(vp, attribute, &ueh, &base_offset); + if (error) + goto vopunlock_exit; + + /* Flag it as not in use. */ + ueh.ueh_flags = 0; /* No need to byte swap 0 */ + ueh.ueh_len = 0; /* ...ditto... 
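+ * (Only the header is rewritten here; the attribute
+ * data bytes themselves are left in place.)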
*/ + + local_aiov.iov_base = &ueh; + local_aiov.iov_len = sizeof(struct ufs_extattr_header); + local_aio.uio_iov = &local_aiov; + local_aio.uio_iovcnt = 1; + local_aio.uio_rw = UIO_WRITE; + local_aio.uio_offset = base_offset; + local_aio.uio_resid = sizeof(struct ufs_extattr_header); + UIO_SETUP_SYSSPACE(&local_aio); + + ioflag = IO_NODELOCKED; + if (ufs_extattr_sync) + ioflag |= IO_SYNC; + error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag, + ump->um_extattr.uepm_ucred); + if (error) + goto vopunlock_exit; + + if (local_aio.uio_resid != 0) + error = ENXIO; + + vopunlock_exit: + VOP_UNLOCK(attribute->uele_backing_vnode); + + return (error); +} + +/* + * Called by UFS when an inode is no longer active and should have its + * attributes stripped. + */ +void +ufs_extattr_vnode_inactive(struct vnode *vp, struct lwp *l) +{ + struct ufs_extattr_list_entry *uele; + struct mount *mp = vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + + /* + * In that case, we cannot lock. We should not have any active vnodes + * on the fs if this is not yet initialized but is going to be, so + * this can go unlocked. + */ + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) + return; + + ufs_extattr_uepm_lock(ump); + + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) { + ufs_extattr_uepm_unlock(ump); + return; + } + + LIST_FOREACH(uele, &ump->um_extattr.uepm_list, uele_entries) + ufs_extattr_rm(vp, uele->uele_attrnamespace, + uele->uele_attrname, lwp0.l_cred, l); + + ufs_extattr_uepm_unlock(ump); +} + +void +ufs_extattr_init(void) +{ + + malloc_type_attach(M_UFS_EXTATTR); +} + +void +ufs_extattr_done(void) +{ + + malloc_type_detach(M_UFS_EXTATTR); +} diff --git a/include/ufs/ufs/ufs_extern.h b/sys/ufs/ufs/ufs_extern.h similarity index 100% rename from include/ufs/ufs/ufs_extern.h rename to sys/ufs/ufs/ufs_extern.h diff --git a/sys/ufs/ufs/ufs_ihash.c b/sys/ufs/ufs/ufs_ihash.c new file mode 100644 index 000000000..213f3357d --- /dev/null +++ b/sys/ufs/ufs/ufs_ihash.c @@ -0,0 +1,191 @@ +/* $NetBSD: ufs_ihash.c,v 1.31 2011/06/12 03:36:02 rmind Exp $ */ + +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_ihash.c 8.7 (Berkeley) 5/17/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ufs_ihash.c,v 1.31 2011/06/12 03:36:02 rmind Exp $"); + +#include +#include +#include +#include +#include + +#include +#include + +/* + * Structures associated with inode cacheing. + */ +static LIST_HEAD(ihashhead, inode) *ihashtbl; +static u_long ihash; /* size of hash table - 1 */ +#define INOHASH(device, inum) (((device) + (inum)) & ihash) + +kmutex_t ufs_ihash_lock; +kmutex_t ufs_hashlock; + +/* + * Initialize inode hash table. + */ +void +ufs_ihashinit(void) +{ + + mutex_init(&ufs_hashlock, MUTEX_DEFAULT, IPL_NONE); + mutex_init(&ufs_ihash_lock, MUTEX_DEFAULT, IPL_NONE); + ihashtbl = hashinit(desiredvnodes, HASH_LIST, true, &ihash); +} + +/* + * Reinitialize inode hash table. + */ + +void +ufs_ihashreinit(void) +{ + struct inode *ip; + struct ihashhead *oldhash, *hash; + u_long oldmask, mask, val; + int i; + + hash = hashinit(desiredvnodes, HASH_LIST, true, &mask); + mutex_enter(&ufs_ihash_lock); + oldhash = ihashtbl; + oldmask = ihash; + ihashtbl = hash; + ihash = mask; + for (i = 0; i <= oldmask; i++) { + while ((ip = LIST_FIRST(&oldhash[i])) != NULL) { + LIST_REMOVE(ip, i_hash); + val = INOHASH(ip->i_dev, ip->i_number); + LIST_INSERT_HEAD(&hash[val], ip, i_hash); + } + } + mutex_exit(&ufs_ihash_lock); + hashdone(oldhash, HASH_LIST, oldmask); +} + +/* + * Free inode hash table. + */ +void +ufs_ihashdone(void) +{ + + hashdone(ihashtbl, HASH_LIST, ihash); + mutex_destroy(&ufs_hashlock); + mutex_destroy(&ufs_ihash_lock); +} + +/* + * Use the device/inum pair to find the incore inode, and return a pointer + * to it. If it is in core, return it, even if it is locked. + */ +struct vnode * +ufs_ihashlookup(dev_t dev, ino_t inum) +{ + struct inode *ip; + struct ihashhead *ipp; + + KASSERT(mutex_owned(&ufs_ihash_lock)); + + ipp = &ihashtbl[INOHASH(dev, inum)]; + LIST_FOREACH(ip, ipp, i_hash) { + if (inum == ip->i_number && dev == ip->i_dev) + break; + } + if (ip) + return (ITOV(ip)); + return (NULLVP); +} + +/* + * Use the device/inum pair to find the incore inode, and return a pointer + * to it. If it is in core, but locked, wait for it. + */ +struct vnode * +ufs_ihashget(dev_t dev, ino_t inum, int flags) +{ + struct ihashhead *ipp; + struct inode *ip; + struct vnode *vp; + + loop: + mutex_enter(&ufs_ihash_lock); + ipp = &ihashtbl[INOHASH(dev, inum)]; + LIST_FOREACH(ip, ipp, i_hash) { + if (inum == ip->i_number && dev == ip->i_dev) { + vp = ITOV(ip); + if (flags == 0) { + mutex_exit(&ufs_ihash_lock); + } else { + mutex_enter(vp->v_interlock); + mutex_exit(&ufs_ihash_lock); + if (vget(vp, flags)) + goto loop; + } + return (vp); + } + } + mutex_exit(&ufs_ihash_lock); + return (NULL); +} + +/* + * Insert the inode into the hash table, and return it locked. 
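+ * The caller must hold ufs_hashlock; the vnode is locked exclusively
+ * before the inode is put on its hash chain under ufs_ihash_lock.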
+ */ +void +ufs_ihashins(struct inode *ip) +{ + struct ihashhead *ipp; + + KASSERT(mutex_owned(&ufs_hashlock)); + + /* lock the inode, then put it on the appropriate hash list */ + VOP_LOCK(ITOV(ip), LK_EXCLUSIVE); + + mutex_enter(&ufs_ihash_lock); + ipp = &ihashtbl[INOHASH(ip->i_dev, ip->i_number)]; + LIST_INSERT_HEAD(ipp, ip, i_hash); + mutex_exit(&ufs_ihash_lock); +} + +/* + * Remove the inode from the hash table. + */ +void +ufs_ihashrem(struct inode *ip) +{ + mutex_enter(&ufs_ihash_lock); + LIST_REMOVE(ip, i_hash); + mutex_exit(&ufs_ihash_lock); +} diff --git a/sys/ufs/ufs/ufs_inode.c b/sys/ufs/ufs/ufs_inode.c new file mode 100644 index 000000000..7a9eea4ff --- /dev/null +++ b/sys/ufs/ufs/ufs_inode.c @@ -0,0 +1,311 @@ +/* $NetBSD: ufs_inode.c,v 1.88 2011/09/20 14:01:33 chs Exp $ */ + +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_inode.c 8.9 (Berkeley) 5/14/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ufs_inode.c,v 1.88 2011/09/20 14:01:33 chs Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_ffs.h" +#include "opt_quota.h" +#include "opt_wapbl.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#ifdef UFS_DIRHASH +#include +#endif +#ifdef UFS_EXTATTR +#include +#endif + +#include + +extern int prtactive; + +/* + * Last reference to an inode. If necessary, write or delete it. 
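+ * If the link count has dropped to zero on a writable file system,
+ * the inode is truncated (one indirect block per transaction when
+ * journaling) and its mode cleared; the final free is deferred to
+ * ufs_reclaim(), and *a_recycle tells the caller whether the vnode
+ * can be recycled immediately.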
+ */ +int +ufs_inactive(void *v) +{ + struct vop_inactive_args /* { + struct vnode *a_vp; + struct bool *a_recycle; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct mount *transmp; + mode_t mode; + int error = 0; + int logged = 0; + + UFS_WAPBL_JUNLOCK_ASSERT(vp->v_mount); + + transmp = vp->v_mount; + fstrans_start(transmp, FSTRANS_LAZY); + /* + * Ignore inodes related to stale file handles. + */ + if (ip->i_mode == 0) + goto out; + if (ip->i_nlink <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { +#ifdef UFS_EXTATTR + ufs_extattr_vnode_inactive(vp, curlwp); +#endif + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) + goto out; + logged = 1; + if (ip->i_size != 0) { + /* + * When journaling, only truncate one indirect block + * at a time + */ + if (vp->v_mount->mnt_wapbl) { + uint64_t incr = MNINDIR(ip->i_ump) << + vp->v_mount->mnt_fs_bshift; /* Power of 2 */ + uint64_t base = NDADDR << + vp->v_mount->mnt_fs_bshift; + while (!error && ip->i_size > base + incr) { + /* + * round down to next full indirect + * block boundary. + */ + uint64_t nsize = base + + ((ip->i_size - base - 1) & + ~(incr - 1)); + error = UFS_TRUNCATE(vp, nsize, 0, + NOCRED); + if (error) + break; + UFS_WAPBL_END(vp->v_mount); + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) + goto out; + } + } + if (!error) + error = UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED); + } +#if defined(QUOTA) || defined(QUOTA2) + (void)chkiq(ip, -1, NOCRED, 0); +#endif + DIP_ASSIGN(ip, rdev, 0); + mode = ip->i_mode; + ip->i_mode = 0; + ip->i_omode = mode; + DIP_ASSIGN(ip, mode, 0); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + /* + * Defer final inode free and update to ufs_reclaim(). + */ + } + + if (ip->i_flag & (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) { + if (!logged++) { + int err; + err = UFS_WAPBL_BEGIN(vp->v_mount); + if (err) + goto out; + } + UFS_UPDATE(vp, NULL, NULL, 0); + } + if (logged) + UFS_WAPBL_END(vp->v_mount); +out: + /* + * If we are done with the inode, reclaim it + * so that it can be reused immediately. + */ + *ap->a_recycle = (ip->i_mode == 0); + VOP_UNLOCK(vp); + fstrans_done(transmp); + return (error); +} + +/* + * Reclaim an inode so that it can be used for other purposes. + */ +int +ufs_reclaim(struct vnode *vp) +{ + struct inode *ip = VTOI(vp); + + if (prtactive && vp->v_usecount > 1) + vprint("ufs_reclaim: pushing active", vp); + + if (!UFS_WAPBL_BEGIN(vp->v_mount)) { + UFS_UPDATE(vp, NULL, NULL, UPDATE_CLOSE); + UFS_WAPBL_END(vp->v_mount); + } + UFS_UPDATE(vp, NULL, NULL, UPDATE_CLOSE); + + /* + * Remove the inode from its hash chain. + */ + ufs_ihashrem(ip); + + if (ip->i_devvp) { + vrele(ip->i_devvp); + ip->i_devvp = 0; + } +#if defined(QUOTA) || defined(QUOTA2) + ufsquota_free(ip); +#endif +#ifdef UFS_DIRHASH + if (ip->i_dirhash != NULL) + ufsdirhash_free(ip); +#endif + return (0); +} + +/* + * allocate a range of blocks in a file. + * after this function returns, any page entirely contained within the range + * will map to invalid data and thus must be overwritten before it is made + * accessible to others. 
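+ * The pages covering the range are read or created and kept busy
+ * while GOP_ALLOC() allocates the backing blocks, so racing threads
+ * never see the old block contents; on success PG_CLEAN is cleared,
+ * and PG_RDONLY is cleared on pages now fully backed by disk.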
+ */ + +int +ufs_balloc_range(struct vnode *vp, off_t off, off_t len, kauth_cred_t cred, + int flags) +{ + off_t neweof; /* file size after the operation */ + off_t neweob; /* offset next to the last block after the operation */ + off_t pagestart; /* starting offset of range covered by pgs */ + off_t eob; /* offset next to allocated blocks */ + struct uvm_object *uobj; + int i, delta, error, npages; + int bshift = vp->v_mount->mnt_fs_bshift; + int bsize = 1 << bshift; + int ppb = MAX(bsize >> PAGE_SHIFT, 1); + struct vm_page **pgs; + size_t pgssize; + UVMHIST_FUNC("ufs_balloc_range"); UVMHIST_CALLED(ubchist); + UVMHIST_LOG(ubchist, "vp %p off 0x%x len 0x%x u_size 0x%x", + vp, off, len, vp->v_size); + + neweof = MAX(vp->v_size, off + len); + GOP_SIZE(vp, neweof, &neweob, 0); + + error = 0; + uobj = &vp->v_uobj; + + /* + * read or create pages covering the range of the allocation and + * keep them locked until the new block is allocated, so there + * will be no window where the old contents of the new block are + * visible to racing threads. + */ + + pagestart = trunc_page(off) & ~(bsize - 1); + npages = MIN(ppb, (round_page(neweob) - pagestart) >> PAGE_SHIFT); + pgssize = npages * sizeof(struct vm_page *); + pgs = kmem_zalloc(pgssize, KM_SLEEP); + + /* + * adjust off to be block-aligned. + */ + + delta = off & (bsize - 1); + off -= delta; + len += delta; + + genfs_node_wrlock(vp); + mutex_enter(uobj->vmobjlock); + error = VOP_GETPAGES(vp, pagestart, pgs, &npages, 0, + VM_PROT_WRITE, 0, PGO_SYNCIO | PGO_PASTEOF | PGO_NOBLOCKALLOC | + PGO_NOTIMESTAMP | PGO_GLOCKHELD); + if (error) { + goto out; + } + + /* + * now allocate the range. + */ + + error = GOP_ALLOC(vp, off, len, flags, cred); + genfs_node_unlock(vp); + + /* + * if the allocation succeeded, clear PG_CLEAN on all the pages + * and clear PG_RDONLY on any pages that are now fully backed + * by disk blocks. if the allocation failed, we do not invalidate + * the pages since they might have already existed and been dirty, + * in which case we need to keep them around. if we created the pages, + * they will be clean and read-only, and leaving such pages + * in the cache won't cause any problems. + */ + + GOP_SIZE(vp, off + len, &eob, 0); + mutex_enter(uobj->vmobjlock); + mutex_enter(&uvm_pageqlock); + for (i = 0; i < npages; i++) { + KASSERT((pgs[i]->flags & PG_RELEASED) == 0); + if (!error) { + if (off <= pagestart + (i << PAGE_SHIFT) && + pagestart + ((i + 1) << PAGE_SHIFT) <= eob) { + pgs[i]->flags &= ~PG_RDONLY; + } + pgs[i]->flags &= ~PG_CLEAN; + } + uvm_pageactivate(pgs[i]); + } + mutex_exit(&uvm_pageqlock); + uvm_page_unbusy(pgs, npages); + mutex_exit(uobj->vmobjlock); + + out: + kmem_free(pgs, pgssize); + return error; +} diff --git a/sys/ufs/ufs/ufs_lookup.c b/sys/ufs/ufs/ufs_lookup.c new file mode 100644 index 000000000..aa395de18 --- /dev/null +++ b/sys/ufs/ufs/ufs_lookup.c @@ -0,0 +1,1500 @@ +/* $NetBSD: ufs_lookup.c,v 1.111 2011/07/17 22:07:59 dholland Exp $ */ + +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_lookup.c 8.9 (Berkeley) 8/11/94 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ufs_lookup.c,v 1.111 2011/07/17 22:07:59 dholland Exp $"); + +#ifdef _KERNEL_OPT +#include "opt_ffs.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#ifdef UFS_DIRHASH +#include +#endif +#include +#include +#include +#include + +#ifdef DIAGNOSTIC +int dirchk = 1; +#else +int dirchk = 0; +#endif + +#define FSFMT(vp) (((vp)->v_mount->mnt_iflag & IMNT_DTYPE) == 0) + +/* + * Convert a component of a pathname into a pointer to a locked inode. + * This is a very central and rather complicated routine. + * If the file system is not maintained in a strict tree hierarchy, + * this can result in a deadlock situation (see comments in code below). + * + * The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending + * on whether the name is to be looked up, created, renamed, or deleted. + * When CREATE, RENAME, or DELETE is specified, information usable in + * creating, renaming, or deleting a directory entry may be calculated. + * If flag has LOCKPARENT or'ed into it and the target of the pathname + * exists, lookup returns both the target and its parent directory locked. + * When creating or renaming and LOCKPARENT is specified, the target may + * not be ".". When deleting and LOCKPARENT is specified, the target may + * be "."., but the caller must check to ensure it does an vrele and vput + * instead of two vputs. 
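+ * (When the target is "." the directory and the target are the same
+ * vnode, so only one of the two references may be dropped with vput.)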
+ * + * Overall outline of ufs_lookup: + * + * check accessibility of directory + * look for name in cache, if found, then if at end of path + * and deleting or creating, drop it, else return name + * search for name in directory, to found or notfound + * notfound: + * if creating, return locked directory, leaving info on available slots + * else return error + * found: + * if at end of path and deleting, return information to allow delete + * if at end of path and rewriting (RENAME and LOCKPARENT), lock target + * inode and return info to allow rewrite + * if not at end, add name to cache; if at end and neither creating + * nor deleting, add name to cache + */ +int +ufs_lookup(void *v) +{ + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap = v; + struct vnode *vdp = ap->a_dvp; /* vnode for directory being searched */ + struct inode *dp = VTOI(vdp); /* inode for directory being searched */ + struct buf *bp; /* a buffer of directory entries */ + struct direct *ep; /* the current directory entry */ + int entryoffsetinblock; /* offset of ep in bp's buffer */ + enum {NONE, COMPACT, FOUND} slotstatus; + doff_t slotoffset; /* offset of area with free space */ + int slotsize; /* size of area at slotoffset */ + int slotfreespace; /* amount of space free in slot */ + int slotneeded; /* size of the entry we're seeking */ + int numdirpasses; /* strategy for directory search */ + doff_t endsearch; /* offset to end directory search */ + doff_t prevoff; /* prev entry dp->i_offset */ + struct vnode *pdp; /* saved dp during symlink work */ + struct vnode *tdp; /* returned by VFS_VGET */ + doff_t enduseful; /* pointer past last used dir slot */ + u_long bmask; /* block offset mask */ + int namlen, error; + struct vnode **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + kauth_cred_t cred = cnp->cn_cred; + int flags; + int nameiop = cnp->cn_nameiop; + struct ufsmount *ump = dp->i_ump; + const int needswap = UFS_MPNEEDSWAP(ump); + int dirblksiz = ump->um_dirblksiz; + ino_t foundino; + struct ufs_lookup_results *results; + + flags = cnp->cn_flags; + + bp = NULL; + slotoffset = -1; + *vpp = NULL; + endsearch = 0; /* silence compiler warning */ + + /* + * Produce the auxiliary lookup results into i_crap. Increment + * its serial number so elsewhere we can tell if we're using + * stale results. This should not be done this way. XXX. + */ + results = &dp->i_crap; + dp->i_crapcounter++; + + /* + * Check accessiblity of directory. + */ + if ((error = VOP_ACCESS(vdp, VEXEC, cred)) != 0) + return (error); + + if ((flags & ISLASTCN) && (vdp->v_mount->mnt_flag & MNT_RDONLY) && + (nameiop == DELETE || nameiop == RENAME)) + return (EROFS); + + /* + * We now have a segment name to search for, and a directory to search. + * + * Before tediously performing a linear scan of the directory, + * check the name cache to see if the directory/name pair + * we are looking for is known already. + */ + if ((error = cache_lookup(vdp, vpp, cnp)) >= 0) { + return (error); + } + + fstrans_start(vdp->v_mount, FSTRANS_SHARED); + + /* + * Suppress search for slots unless creating + * file and at end of pathname, in which case + * we watch for a place to put the new file in + * case it doesn't already exist. 
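+ * slotstatus starts out as FOUND (no slot search needed); it is reset
+ * to NONE, with slotneeded set to the size of the prospective entry,
+ * only for CREATE or RENAME on the final path component.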
+ */ + slotstatus = FOUND; + slotfreespace = slotsize = slotneeded = 0; + if ((nameiop == CREATE || nameiop == RENAME) && + (flags & ISLASTCN)) { + slotstatus = NONE; + slotneeded = DIRECTSIZ(cnp->cn_namelen); + } + + /* + * If there is cached information on a previous search of + * this directory, pick up where we last left off. + * We cache only lookups as these are the most common + * and have the greatest payoff. Caching CREATE has little + * benefit as it usually must search the entire directory + * to determine that the entry does not exist. Caching the + * location of the last DELETE or RENAME has not reduced + * profiling time and hence has been removed in the interest + * of simplicity. + */ + bmask = vdp->v_mount->mnt_stat.f_iosize - 1; + +#ifdef UFS_DIRHASH + /* + * Use dirhash for fast operations on large directories. The logic + * to determine whether to hash the directory is contained within + * ufsdirhash_build(); a zero return means that it decided to hash + * this directory and it successfully built up the hash table. + */ + if (ufsdirhash_build(dp) == 0) { + /* Look for a free slot if needed. */ + enduseful = dp->i_size; + if (slotstatus != FOUND) { + slotoffset = ufsdirhash_findfree(dp, slotneeded, + &slotsize); + if (slotoffset >= 0) { + slotstatus = COMPACT; + enduseful = ufsdirhash_enduseful(dp); + if (enduseful < 0) + enduseful = dp->i_size; + } + } + /* Look up the component. */ + numdirpasses = 1; + entryoffsetinblock = 0; /* silence compiler warning */ + switch (ufsdirhash_lookup(dp, cnp->cn_nameptr, cnp->cn_namelen, + &results->ulr_offset, &bp, nameiop == DELETE ? &prevoff : NULL)) { + case 0: + ep = (struct direct *)((char *)bp->b_data + + (results->ulr_offset & bmask)); + goto foundentry; + case ENOENT: + results->ulr_offset = roundup(dp->i_size, dirblksiz); + goto notfound; + default: + /* Something failed; just do a linear search. */ + break; + } + } +#endif /* UFS_DIRHASH */ + + if (nameiop != LOOKUP || results->ulr_diroff == 0 || + results->ulr_diroff >= dp->i_size) { + entryoffsetinblock = 0; + results->ulr_offset = 0; + numdirpasses = 1; + } else { + results->ulr_offset = results->ulr_diroff; + if ((entryoffsetinblock = results->ulr_offset & bmask) && + (error = ufs_blkatoff(vdp, (off_t)results->ulr_offset, + NULL, &bp, false))) + goto out; + numdirpasses = 2; + nchstats.ncs_2passes++; + } + prevoff = results->ulr_offset; + endsearch = roundup(dp->i_size, dirblksiz); + enduseful = 0; + +searchloop: + while (results->ulr_offset < endsearch) { + if (curcpu()->ci_schedstate.spc_flags & SPCF_SHOULDYIELD) + preempt(); + /* + * If necessary, get the next directory block. + */ + if ((results->ulr_offset & bmask) == 0) { + if (bp != NULL) + brelse(bp, 0); + error = ufs_blkatoff(vdp, (off_t)results->ulr_offset, NULL, + &bp, false); + if (error) + goto out; + entryoffsetinblock = 0; + } + /* + * If still looking for a slot, and at a DIRBLKSIZ + * boundary, have to start looking for free space again. + */ + if (slotstatus == NONE && + (entryoffsetinblock & (dirblksiz - 1)) == 0) { + slotoffset = -1; + slotfreespace = 0; + } + /* + * Get pointer to next entry. + * Full validation checks are slow, so we only check + * enough to insure forward progress through the + * directory. Complete checks can be run by patching + * "dirchk" to be true. 
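+ * An entry with a zero record length, or one failing ufs_dirbadentry()
+ * when dirchk is set, is reported via ufs_dirbad() and the remainder
+ * of its DIRBLKSIZ block is skipped.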
+ */ + KASSERT(bp != NULL); + ep = (struct direct *)((char *)bp->b_data + entryoffsetinblock); + if (ep->d_reclen == 0 || + (dirchk && ufs_dirbadentry(vdp, ep, entryoffsetinblock))) { + int i; + + ufs_dirbad(dp, results->ulr_offset, "mangled entry"); + i = dirblksiz - (entryoffsetinblock & (dirblksiz - 1)); + results->ulr_offset += i; + entryoffsetinblock += i; + continue; + } + + /* + * If an appropriate sized slot has not yet been found, + * check to see if one is available. Also accumulate space + * in the current block so that we can determine if + * compaction is viable. + */ + if (slotstatus != FOUND) { + int size = ufs_rw16(ep->d_reclen, needswap); + + if (ep->d_ino != 0) + size -= DIRSIZ(FSFMT(vdp), ep, needswap); + if (size > 0) { + if (size >= slotneeded) { + slotstatus = FOUND; + slotoffset = results->ulr_offset; + slotsize = ufs_rw16(ep->d_reclen, + needswap); + } else if (slotstatus == NONE) { + slotfreespace += size; + if (slotoffset == -1) + slotoffset = results->ulr_offset; + if (slotfreespace >= slotneeded) { + slotstatus = COMPACT; + slotsize = results->ulr_offset + + ufs_rw16(ep->d_reclen, + needswap) - + slotoffset; + } + } + } + } + + /* + * Check for a name match. + */ + if (ep->d_ino) { +#if (BYTE_ORDER == LITTLE_ENDIAN) + if (FSFMT(vdp) && needswap == 0) + namlen = ep->d_type; + else + namlen = ep->d_namlen; +#else + if (FSFMT(vdp) && needswap != 0) + namlen = ep->d_type; + else + namlen = ep->d_namlen; +#endif + if (namlen == cnp->cn_namelen && + !memcmp(cnp->cn_nameptr, ep->d_name, + (unsigned)namlen)) { +#ifdef UFS_DIRHASH +foundentry: +#endif + /* + * Save directory entry's inode number and + * reclen in ndp->ni_ufs area, and release + * directory buffer. + */ + if (!FSFMT(vdp) && ep->d_type == DT_WHT) { + slotstatus = FOUND; + slotoffset = results->ulr_offset; + slotsize = ufs_rw16(ep->d_reclen, + needswap); + results->ulr_reclen = slotsize; + /* + * This is used to set results->ulr_endoff, + * which may be used by ufs_direnter2() + * as a length to truncate the + * directory to. Therefore, it must + * point past the end of the last + * non-empty directory entry. We don't + * know where that is in this case, so + * we effectively disable shrinking by + * using the existing size of the + * directory. + * + * Note that we wouldn't expect to + * shrink the directory while rewriting + * an existing entry anyway. + */ + enduseful = endsearch; + ap->a_cnp->cn_flags |= ISWHITEOUT; + numdirpasses--; + goto notfound; + } + foundino = ufs_rw32(ep->d_ino, needswap); + results->ulr_reclen = ufs_rw16(ep->d_reclen, needswap); + goto found; + } + } + prevoff = results->ulr_offset; + results->ulr_offset += ufs_rw16(ep->d_reclen, needswap); + entryoffsetinblock += ufs_rw16(ep->d_reclen, needswap); + if (ep->d_ino) + enduseful = results->ulr_offset; + } +notfound: + /* + * If we started in the middle of the directory and failed + * to find our target, we must check the beginning as well. + */ + if (numdirpasses == 2) { + numdirpasses--; + results->ulr_offset = 0; + endsearch = results->ulr_diroff; + goto searchloop; + } + if (bp != NULL) + brelse(bp, 0); + /* + * If creating, and at end of pathname and current + * directory has not been removed, then can consider + * allowing file to be created. 
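+ * "Has not been removed" is checked via dp->i_nlink != 0; on success
+ * EJUSTRETURN is returned with the slot description left in the
+ * ufs_lookup_results for a later direnter.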
+ */ + if ((nameiop == CREATE || nameiop == RENAME || + (nameiop == DELETE && + (ap->a_cnp->cn_flags & DOWHITEOUT) && + (ap->a_cnp->cn_flags & ISWHITEOUT))) && + (flags & ISLASTCN) && dp->i_nlink != 0) { + /* + * Access for write is interpreted as allowing + * creation of files in the directory. + */ + error = VOP_ACCESS(vdp, VWRITE, cred); + if (error) + goto out; + /* + * Return an indication of where the new directory + * entry should be put. If we didn't find a slot, + * then set results->ulr_count to 0 indicating + * that the new slot belongs at the end of the + * directory. If we found a slot, then the new entry + * can be put in the range from results->ulr_offset to + * results->ulr_offset + results->ulr_count. + */ + if (slotstatus == NONE) { + results->ulr_offset = roundup(dp->i_size, dirblksiz); + results->ulr_count = 0; + enduseful = results->ulr_offset; + } else if (nameiop == DELETE) { + results->ulr_offset = slotoffset; + if ((results->ulr_offset & (dirblksiz - 1)) == 0) + results->ulr_count = 0; + else + results->ulr_count = results->ulr_offset - prevoff; + } else { + results->ulr_offset = slotoffset; + results->ulr_count = slotsize; + if (enduseful < slotoffset + slotsize) + enduseful = slotoffset + slotsize; + } + results->ulr_endoff = roundup(enduseful, dirblksiz); +#if 0 /* commented out by dbj. none of the on disk fields changed */ + dp->i_flag |= IN_CHANGE | IN_UPDATE; +#endif + /* + * We return with the directory locked, so that + * the parameters we set up above will still be + * valid if we actually decide to do a direnter(). + * We return ni_vp == NULL to indicate that the entry + * does not currently exist; we leave a pointer to + * the (locked) directory inode in ndp->ni_dvp. + * + * NB - if the directory is unlocked, then this + * information cannot be used. + */ + error = EJUSTRETURN; + goto out; + } + /* + * Insert name into cache (as non-existent) if appropriate. + */ + if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE) + cache_enter(vdp, *vpp, cnp); + error = ENOENT; + goto out; + +found: + if (numdirpasses == 2) + nchstats.ncs_pass2++; + /* + * Check that directory length properly reflects presence + * of this entry. + */ + if (results->ulr_offset + DIRSIZ(FSFMT(vdp), ep, needswap) > dp->i_size) { + ufs_dirbad(dp, results->ulr_offset, "i_size too small"); + dp->i_size = results->ulr_offset + DIRSIZ(FSFMT(vdp), ep, needswap); + DIP_ASSIGN(dp, size, dp->i_size); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + UFS_WAPBL_UPDATE(vdp, NULL, NULL, UPDATE_DIROP); + } + brelse(bp, 0); + + /* + * Found component in pathname. + * If the final component of path name, save information + * in the cache as to where the entry was found. + */ + if ((flags & ISLASTCN) && nameiop == LOOKUP) + results->ulr_diroff = results->ulr_offset &~ (dirblksiz - 1); + + /* + * If deleting, and at end of pathname, return + * parameters which can be used to remove file. + * Lock the inode, being careful with ".". + */ + if (nameiop == DELETE && (flags & ISLASTCN)) { + /* + * Write access to directory required to delete files. + */ + error = VOP_ACCESS(vdp, VWRITE, cred); + if (error) + goto out; + /* + * Return pointer to current entry in results->ulr_offset, + * and distance past previous entry (if there + * is a previous entry in this block) in results->ulr_count. + * Save directory inode pointer in ndp->ni_dvp for dirremove(). 
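+ * Deleting "." returns the directory itself with an extra reference;
+ * otherwise the target is fetched with VFS_VGET and the sticky-bit
+ * ownership check may refuse the removal with EPERM.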
+ */ + if ((results->ulr_offset & (dirblksiz - 1)) == 0) + results->ulr_count = 0; + else + results->ulr_count = results->ulr_offset - prevoff; + if (dp->i_number == foundino) { + vref(vdp); + *vpp = vdp; + error = 0; + goto out; + } + if (flags & ISDOTDOT) + VOP_UNLOCK(vdp); /* race to get the inode */ + error = VFS_VGET(vdp->v_mount, foundino, &tdp); + if (flags & ISDOTDOT) + vn_lock(vdp, LK_EXCLUSIVE | LK_RETRY); + if (error) + goto out; + /* + * If directory is "sticky", then user must own + * the directory, or the file in it, else she + * may not delete it (unless she's root). This + * implements append-only directories. + */ + if ((dp->i_mode & ISVTX) && + kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, + NULL) != 0 && + kauth_cred_geteuid(cred) != dp->i_uid && + VTOI(tdp)->i_uid != kauth_cred_geteuid(cred)) { + vput(tdp); + error = EPERM; + goto out; + } + *vpp = tdp; + error = 0; + goto out; + } + + /* + * If rewriting (RENAME), return the inode and the + * information required to rewrite the present directory + * Must get inode of directory entry to verify it's a + * regular file, or empty directory. + */ + if (nameiop == RENAME && (flags & ISLASTCN)) { + error = VOP_ACCESS(vdp, VWRITE, cred); + if (error) + goto out; + /* + * Careful about locking second inode. + * This can only occur if the target is ".". + */ + if (dp->i_number == foundino) { + error = EISDIR; + goto out; + } + if (flags & ISDOTDOT) + VOP_UNLOCK(vdp); /* race to get the inode */ + error = VFS_VGET(vdp->v_mount, foundino, &tdp); + if (flags & ISDOTDOT) + vn_lock(vdp, LK_EXCLUSIVE | LK_RETRY); + if (error) + goto out; + *vpp = tdp; + error = 0; + goto out; + } + + /* + * Step through the translation in the name. We do not `vput' the + * directory because we may need it again if a symbolic link + * is relative to the current directory. Instead we save it + * unlocked as "pdp". We must get the target inode before unlocking + * the directory to insure that the inode will not be removed + * before we get it. We prevent deadlock by always fetching + * inodes from the root, moving down the directory tree. Thus + * when following backward pointers ".." we must unlock the + * parent directory before getting the requested directory. + * There is a potential race condition here if both the current + * and parent directories are removed before the VFS_VGET for the + * inode associated with ".." returns. We hope that this occurs + * infrequently since we cannot avoid this race condition without + * implementing a sophisticated deadlock detection algorithm. + * Note also that this simple deadlock detection scheme will not + * work if the file system has any hard links other than ".." + * that point backwards in the directory structure. + */ + pdp = vdp; + if (flags & ISDOTDOT) { + VOP_UNLOCK(pdp); /* race to get the inode */ + error = VFS_VGET(vdp->v_mount, foundino, &tdp); + vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY); + if (error) { + goto out; + } + *vpp = tdp; + } else if (dp->i_number == foundino) { + vref(vdp); /* we want ourself, ie "." */ + *vpp = vdp; + } else { + error = VFS_VGET(vdp->v_mount, foundino, &tdp); + if (error) + goto out; + *vpp = tdp; + } + + /* + * Insert name into cache if appropriate. 
+ */ + if (cnp->cn_flags & MAKEENTRY) + cache_enter(vdp, *vpp, cnp); + error = 0; + +out: + fstrans_done(vdp->v_mount); + return error; +} + +void +ufs_dirbad(struct inode *ip, doff_t offset, const char *how) +{ + struct mount *mp; + + mp = ITOV(ip)->v_mount; + printf("%s: bad dir ino %llu at offset %d: %s\n", + mp->mnt_stat.f_mntonname, (unsigned long long)ip->i_number, + offset, how); + if ((mp->mnt_stat.f_flag & MNT_RDONLY) == 0) + panic("bad dir"); +} + +/* + * Do consistency checking on a directory entry: + * record length must be multiple of 4 + * entry must fit in rest of its DIRBLKSIZ block + * record must be large enough to contain entry + * name is not longer than FFS_MAXNAMLEN + * name must be as long as advertised, and null terminated + */ +int +ufs_dirbadentry(struct vnode *dp, struct direct *ep, int entryoffsetinblock) +{ + int i; + int namlen; + struct ufsmount *ump = VFSTOUFS(dp->v_mount); + const int needswap = UFS_MPNEEDSWAP(ump); + int dirblksiz = ump->um_dirblksiz; + +#if (BYTE_ORDER == LITTLE_ENDIAN) + if (FSFMT(dp) && needswap == 0) + namlen = ep->d_type; + else + namlen = ep->d_namlen; +#else + if (FSFMT(dp) && needswap != 0) + namlen = ep->d_type; + else + namlen = ep->d_namlen; +#endif + if ((ufs_rw16(ep->d_reclen, needswap) & 0x3) != 0 || + ufs_rw16(ep->d_reclen, needswap) > + dirblksiz - (entryoffsetinblock & (dirblksiz - 1)) || + ufs_rw16(ep->d_reclen, needswap) < + DIRSIZ(FSFMT(dp), ep, needswap) || + namlen > FFS_MAXNAMLEN) { + /*return (1); */ + printf("First bad, reclen=%#x, DIRSIZ=%lu, namlen=%d, " + "flags=%#x, entryoffsetinblock=%d, dirblksiz = %d\n", + ufs_rw16(ep->d_reclen, needswap), + (u_long)DIRSIZ(FSFMT(dp), ep, needswap), + namlen, dp->v_mount->mnt_flag, entryoffsetinblock, + dirblksiz); + goto bad; + } + if (ep->d_ino == 0) + return (0); + for (i = 0; i < namlen; i++) + if (ep->d_name[i] == '\0') { + /*return (1); */ + printf("Second bad\n"); + goto bad; + } + if (ep->d_name[i]) + goto bad; + return (0); +bad: + return (1); +} + +/* + * Construct a new directory entry after a call to namei, using the + * name in the componentname argument cnp. The argument ip is the + * inode to which the new directory entry will refer. + */ +void +ufs_makedirentry(struct inode *ip, struct componentname *cnp, + struct direct *newdirp) +{ + newdirp->d_ino = ip->i_number; + newdirp->d_namlen = cnp->cn_namelen; + memcpy(newdirp->d_name, cnp->cn_nameptr, (size_t)cnp->cn_namelen); + newdirp->d_name[cnp->cn_namelen] = '\0'; + if (FSFMT(ITOV(ip))) + newdirp->d_type = 0; + else + newdirp->d_type = IFTODT(ip->i_mode); +} + +/* + * Write a directory entry after a call to namei, using the parameters + * that ufs_lookup left in nameidata and in the ufs_lookup_results. + * + * DVP is the directory to be updated. It must be locked. + * ULR is the ufs_lookup_results structure from the final lookup step. + * TVP is not used. (XXX: why is it here? remove it) + * DIRP is the new directory entry contents. + * CNP is the componentname from the final lookup step. + * NEWDIRBP is not used and (XXX) should be removed. The previous + * comment here said it was used by the now-removed softupdates code. + * + * The link count of the target inode is *not* incremented; the + * caller does that. + * + * If ulr->ulr_count is 0, ufs_lookup did not find space to insert the + * directory entry. 
ulr_offset, which is the place to put the entry, + * should be on a block boundary (and should be at the end of the + * directory AFAIK) and a fresh block is allocated to put the new + * directory entry in. + * + * If ulr->ulr_count is not zero, ufs_lookup found a slot to insert + * the entry into. This slot ranges from ulr_offset to ulr_offset + + * ulr_count. However, this slot may already be partially populated + * requiring compaction. See notes below. + * + * Furthermore, if ulr_count is not zero and ulr_endoff is not the + * same as i_size, the directory is truncated to size ulr_endoff. + */ +int +ufs_direnter(struct vnode *dvp, const struct ufs_lookup_results *ulr, + struct vnode *tvp, struct direct *dirp, + struct componentname *cnp, struct buf *newdirbp) +{ + kauth_cred_t cr; + struct lwp *l; + int newentrysize; + struct inode *dp; + struct buf *bp; + u_int dsize; + struct direct *ep, *nep; + int error, ret, blkoff, loc, spacefree; + char *dirbuf; + struct timespec ts; + struct ufsmount *ump = VFSTOUFS(dvp->v_mount); + const int needswap = UFS_MPNEEDSWAP(ump); + int dirblksiz = ump->um_dirblksiz; + + UFS_WAPBL_JLOCK_ASSERT(dvp->v_mount); + + error = 0; + cr = cnp->cn_cred; + l = curlwp; + + dp = VTOI(dvp); + newentrysize = DIRSIZ(0, dirp, 0); + +#if 0 + struct ufs_lookup_results *ulr; + /* XXX should handle this material another way */ + ulr = &dp->i_crap; + UFS_CHECK_CRAPCOUNTER(dp); +#endif + + if (ulr->ulr_count == 0) { + /* + * If ulr_count is 0, then namei could find no + * space in the directory. Here, ulr_offset will + * be on a directory block boundary and we will write the + * new entry into a fresh block. + */ + if (ulr->ulr_offset & (dirblksiz - 1)) + panic("ufs_direnter: newblk"); + if ((error = UFS_BALLOC(dvp, (off_t)ulr->ulr_offset, dirblksiz, + cr, B_CLRBUF | B_SYNC, &bp)) != 0) { + return (error); + } + dp->i_size = ulr->ulr_offset + dirblksiz; + DIP_ASSIGN(dp, size, dp->i_size); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + uvm_vnp_setsize(dvp, dp->i_size); + dirp->d_reclen = ufs_rw16(dirblksiz, needswap); + dirp->d_ino = ufs_rw32(dirp->d_ino, needswap); + if (FSFMT(dvp)) { +#if (BYTE_ORDER == LITTLE_ENDIAN) + if (needswap == 0) { +#else + if (needswap != 0) { +#endif + u_char tmp = dirp->d_namlen; + dirp->d_namlen = dirp->d_type; + dirp->d_type = tmp; + } + } + blkoff = ulr->ulr_offset & (ump->um_mountp->mnt_stat.f_iosize - 1); + memcpy((char *)bp->b_data + blkoff, dirp, newentrysize); +#ifdef UFS_DIRHASH + if (dp->i_dirhash != NULL) { + ufsdirhash_newblk(dp, ulr->ulr_offset); + ufsdirhash_add(dp, dirp, ulr->ulr_offset); + ufsdirhash_checkblock(dp, (char *)bp->b_data + blkoff, + ulr->ulr_offset); + } +#endif + error = VOP_BWRITE(bp->b_vp, bp); + vfs_timestamp(&ts); + ret = UFS_UPDATE(dvp, &ts, &ts, UPDATE_DIROP); + if (error == 0) + return (ret); + return (error); + } + + /* + * If ulr_count is non-zero, then namei found space for the new + * entry in the range ulr_offset to url_offset + url_count + * in the directory. To use this space, we may have to compact + * the entries located there, by copying them together towards the + * beginning of the block, leaving the free space in one usable + * chunk at the end. + */ + + /* + * Increase size of directory if entry eats into new space. + * This should never push the size past a new multiple of + * DIRBLKSIZ. + * + * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN. 
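+ * If it happens anyway, the directory size is simply extended (with
+ * a DIAGNOSTIC warning) before the block is fetched.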
+ */ + if (ulr->ulr_offset + ulr->ulr_count > dp->i_size) { +#ifdef DIAGNOSTIC + printf("ufs_direnter: reached 4.2-only block, " + "not supposed to happen\n"); +#endif + dp->i_size = ulr->ulr_offset + ulr->ulr_count; + DIP_ASSIGN(dp, size, dp->i_size); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP); + } + /* + * Get the block containing the space for the new directory entry. + */ + error = ufs_blkatoff(dvp, (off_t)ulr->ulr_offset, &dirbuf, &bp, true); + if (error) { + return (error); + } + /* + * Find space for the new entry. In the simple case, the entry at + * offset base will have the space. If it does not, then namei + * arranged that compacting the region dp->i_offset to + * dp->i_offset + dp->i_count would yield the space. + */ + ep = (struct direct *)dirbuf; + dsize = (ep->d_ino != 0) ? DIRSIZ(FSFMT(dvp), ep, needswap) : 0; + spacefree = ufs_rw16(ep->d_reclen, needswap) - dsize; + for (loc = ufs_rw16(ep->d_reclen, needswap); loc < ulr->ulr_count; ) { + uint16_t reclen; + + nep = (struct direct *)(dirbuf + loc); + + /* Trim the existing slot (NB: dsize may be zero). */ + ep->d_reclen = ufs_rw16(dsize, needswap); + ep = (struct direct *)((char *)ep + dsize); + + reclen = ufs_rw16(nep->d_reclen, needswap); + loc += reclen; + if (nep->d_ino == 0) { + /* + * A mid-block unused entry. Such entries are + * never created by the kernel, but fsck_ffs + * can create them (and it doesn't fix them). + * + * Add up the free space, and initialise the + * relocated entry since we don't memcpy it. + */ + spacefree += reclen; + ep->d_ino = 0; + dsize = 0; + continue; + } + dsize = DIRSIZ(FSFMT(dvp), nep, needswap); + spacefree += reclen - dsize; +#ifdef UFS_DIRHASH + if (dp->i_dirhash != NULL) + ufsdirhash_move(dp, nep, + ulr->ulr_offset + ((char *)nep - dirbuf), + ulr->ulr_offset + ((char *)ep - dirbuf)); +#endif + memcpy((void *)ep, (void *)nep, dsize); + } + /* + * Here, `ep' points to a directory entry containing `dsize' in-use + * bytes followed by `spacefree' unused bytes. If ep->d_ino == 0, + * then the entry is completely unused (dsize == 0). The value + * of ep->d_reclen is always indeterminate. + * + * Update the pointer fields in the previous entry (if any), + * copy in the new entry, and write out the block. 
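+ * If the slot is unused, or is a whiteout for the same name, the new
+ * entry absorbs the entire region; otherwise the previous entry keeps
+ * dsize bytes and the new entry takes just the free space. Afterwards
+ * the directory may be truncated back to ulr_endoff.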
+ */ + if (ep->d_ino == 0 || + (ufs_rw32(ep->d_ino, needswap) == WINO && + memcmp(ep->d_name, dirp->d_name, dirp->d_namlen) == 0)) { + if (spacefree + dsize < newentrysize) + panic("ufs_direnter: compact1"); + dirp->d_reclen = spacefree + dsize; + } else { + if (spacefree < newentrysize) + panic("ufs_direnter: compact2"); + dirp->d_reclen = spacefree; + ep->d_reclen = ufs_rw16(dsize, needswap); + ep = (struct direct *)((char *)ep + dsize); + } + dirp->d_reclen = ufs_rw16(dirp->d_reclen, needswap); + dirp->d_ino = ufs_rw32(dirp->d_ino, needswap); + if (FSFMT(dvp)) { +#if (BYTE_ORDER == LITTLE_ENDIAN) + if (needswap == 0) { +#else + if (needswap != 0) { +#endif + u_char tmp = dirp->d_namlen; + dirp->d_namlen = dirp->d_type; + dirp->d_type = tmp; + } + } +#ifdef UFS_DIRHASH + if (dp->i_dirhash != NULL && (ep->d_ino == 0 || + dirp->d_reclen == spacefree)) + ufsdirhash_add(dp, dirp, ulr->ulr_offset + ((char *)ep - dirbuf)); +#endif + memcpy((void *)ep, (void *)dirp, (u_int)newentrysize); +#ifdef UFS_DIRHASH + if (dp->i_dirhash != NULL) + ufsdirhash_checkblock(dp, dirbuf - + (ulr->ulr_offset & (dirblksiz - 1)), + ulr->ulr_offset & ~(dirblksiz - 1)); +#endif + error = VOP_BWRITE(bp->b_vp, bp); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + /* + * If all went well, and the directory can be shortened, proceed + * with the truncation. Note that we have to unlock the inode for + * the entry that we just entered, as the truncation may need to + * lock other inodes which can lead to deadlock if we also hold a + * lock on the newly entered node. + */ + if (error == 0 && ulr->ulr_endoff && ulr->ulr_endoff < dp->i_size) { +#ifdef UFS_DIRHASH + if (dp->i_dirhash != NULL) + ufsdirhash_dirtrunc(dp, ulr->ulr_endoff); +#endif + (void) UFS_TRUNCATE(dvp, (off_t)ulr->ulr_endoff, IO_SYNC, cr); + } + UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP); + return (error); +} + +/* + * Remove a directory entry after a call to namei, using the + * parameters that ufs_lookup left in nameidata and in the + * ufs_lookup_results. + * + * DVP is the directory to be updated. It must be locked. + * ULR is the ufs_lookup_results structure from the final lookup step. + * IP, if not null, is the inode being unlinked. + * FLAGS may contain DOWHITEOUT. + * ISRMDIR is not used and (XXX) should be removed. + * + * If FLAGS contains DOWHITEOUT the entry is replaced with a whiteout + * instead of being cleared. + * + * ulr->ulr_offset contains the position of the directory entry + * to be removed. + * + * ulr->ulr_reclen contains the size of the directory entry to be + * removed. + * + * ulr->ulr_count contains the size of the *previous* directory + * entry. This allows finding it, for free space management. If + * ulr_count is 0, the target entry is at the beginning of the + * directory. (Does this ever happen? The first entry should be ".", + * which should only be removed at rmdir time. Does rmdir come here + * to clear out the "." and ".." entries? Perhaps, but I doubt it.) + * + * The space is marked free by adding it to the record length (not + * name length) of the preceding entry. If the first entry becomes + * free, it is marked free by setting the inode number to 0. + * + * The link count of IP is decremented. Note that this is not the + * inverse behavior of ufs_direnter, which does not adjust link + * counts. Sigh. 
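+ * With DOWHITEOUT the entry is converted to a whiteout (d_ino = WINO,
+ * d_type = DT_WHT) instead of being freed.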
+ */ +int +ufs_dirremove(struct vnode *dvp, const struct ufs_lookup_results *ulr, + struct inode *ip, int flags, int isrmdir) +{ + struct inode *dp = VTOI(dvp); + struct direct *ep; + struct buf *bp; + int error; +#ifdef FFS_EI + const int needswap = UFS_MPNEEDSWAP(dp->i_ump); +#endif + + UFS_WAPBL_JLOCK_ASSERT(dvp->v_mount); + + if (flags & DOWHITEOUT) { + /* + * Whiteout entry: set d_ino to WINO. + */ + error = ufs_blkatoff(dvp, (off_t)ulr->ulr_offset, (void *)&ep, + &bp, true); + if (error) + return (error); + ep->d_ino = ufs_rw32(WINO, needswap); + ep->d_type = DT_WHT; + goto out; + } + + if ((error = ufs_blkatoff(dvp, + (off_t)(ulr->ulr_offset - ulr->ulr_count), (void *)&ep, &bp, true)) != 0) + return (error); + +#ifdef UFS_DIRHASH + /* + * Remove the dirhash entry. This is complicated by the fact + * that `ep' is the previous entry when dp->i_count != 0. + */ + if (dp->i_dirhash != NULL) + ufsdirhash_remove(dp, (ulr->ulr_count == 0) ? ep : + (struct direct *)((char *)ep + + ufs_rw16(ep->d_reclen, needswap)), ulr->ulr_offset); +#endif + + if (ulr->ulr_count == 0) { + /* + * First entry in block: set d_ino to zero. + */ + ep->d_ino = 0; + } else { + /* + * Collapse new free space into previous entry. + */ + ep->d_reclen = + ufs_rw16(ufs_rw16(ep->d_reclen, needswap) + ulr->ulr_reclen, + needswap); + } + +#ifdef UFS_DIRHASH + if (dp->i_dirhash != NULL) { + int dirblksiz = ip->i_ump->um_dirblksiz; + ufsdirhash_checkblock(dp, (char *)ep - + ((ulr->ulr_offset - ulr->ulr_count) & (dirblksiz - 1)), + ulr->ulr_offset & ~(dirblksiz - 1)); + } +#endif + +out: + if (ip) { + ip->i_nlink--; + DIP_ASSIGN(ip, nlink, ip->i_nlink); + ip->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(ITOV(ip), NULL, NULL, 0); + } + error = VOP_BWRITE(bp->b_vp, bp); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + /* + * If the last named reference to a snapshot goes away, + * drop its snapshot reference so that it will be reclaimed + * when last open reference goes away. + */ + if (ip != 0 && (ip->i_flags & SF_SNAPSHOT) != 0 && + ip->i_nlink == 0) + ffs_snapgone(ip); + UFS_WAPBL_UPDATE(dvp, NULL, NULL, 0); + return (error); +} + +/* + * Rewrite an existing directory entry to point at the inode supplied. + * + * DP is the directory to update. + * OFFSET is the position of the entry in question. It may come + * from ulr_offset of a ufs_lookup_results. + * OIP is the old inode the directory previously pointed to. + * NEWINUM is the number of the new inode. + * NEWTYPE is the new value for the type field of the directory entry. + * (This is ignored if the fs doesn't support that.) + * ISRMDIR is not used and (XXX) should be removed. + * IFLAGS are added to DP's inode flags. + * + * The link count of OIP is decremented. Note that the link count of + * the new inode is *not* incremented. Yay for symmetry. 
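+ * Only d_ino (and, where the file system stores entry types, d_type)
+ * is changed in place; IFLAGS is ORed into the directory's inode
+ * flags after the block is written back.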
+ */ +int +ufs_dirrewrite(struct inode *dp, off_t offset, + struct inode *oip, ino_t newinum, int newtype, + int isrmdir, int iflags) +{ + struct buf *bp; + struct direct *ep; + struct vnode *vdp = ITOV(dp); + int error; + + error = ufs_blkatoff(vdp, offset, (void *)&ep, &bp, true); + if (error) + return (error); + ep->d_ino = ufs_rw32(newinum, UFS_MPNEEDSWAP(dp->i_ump)); + if (!FSFMT(vdp)) + ep->d_type = newtype; + oip->i_nlink--; + DIP_ASSIGN(oip, nlink, oip->i_nlink); + oip->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(ITOV(oip), NULL, NULL, UPDATE_DIROP); + error = VOP_BWRITE(bp->b_vp, bp); + dp->i_flag |= iflags; + /* + * If the last named reference to a snapshot goes away, + * drop its snapshot reference so that it will be reclaimed + * when last open reference goes away. + */ + if ((oip->i_flags & SF_SNAPSHOT) != 0 && oip->i_nlink == 0) + ffs_snapgone(oip); + UFS_WAPBL_UPDATE(vdp, NULL, NULL, UPDATE_DIROP); + return (error); +} + +/* + * Check if a directory is empty or not. + * Inode supplied must be locked. + * + * Using a struct dirtemplate here is not precisely + * what we want, but better than using a struct direct. + * + * NB: does not handle corrupted directories. + */ +int +ufs_dirempty(struct inode *ip, ino_t parentino, kauth_cred_t cred) +{ + doff_t off; + struct dirtemplate dbuf; + struct direct *dp = (struct direct *)&dbuf; + int error, namlen; + size_t count; + const int needswap = UFS_IPNEEDSWAP(ip); +#define MINDIRSIZ (sizeof (struct dirtemplate) / 2) + + for (off = 0; off < ip->i_size; + off += ufs_rw16(dp->d_reclen, needswap)) { + error = vn_rdwr(UIO_READ, ITOV(ip), (void *)dp, MINDIRSIZ, off, + UIO_SYSSPACE, IO_NODELOCKED, cred, &count, NULL); + /* + * Since we read MINDIRSIZ, residual must + * be 0 unless we're at end of file. + */ + if (error || count != 0) + return (0); + /* avoid infinite loops */ + if (dp->d_reclen == 0) + return (0); + /* skip empty entries */ + if (dp->d_ino == 0 || ufs_rw32(dp->d_ino, needswap) == WINO) + continue; + /* accept only "." and ".." */ +#if (BYTE_ORDER == LITTLE_ENDIAN) + if (FSFMT(ITOV(ip)) && needswap == 0) + namlen = dp->d_type; + else + namlen = dp->d_namlen; +#else + if (FSFMT(ITOV(ip)) && needswap != 0) + namlen = dp->d_type; + else + namlen = dp->d_namlen; +#endif + if (namlen > 2) + return (0); + if (dp->d_name[0] != '.') + return (0); + /* + * At this point namlen must be 1 or 2. + * 1 implies ".", 2 implies ".." if second + * char is also "." + */ + if (namlen == 1 && + ufs_rw32(dp->d_ino, needswap) == ip->i_number) + continue; + if (dp->d_name[1] == '.' && + ufs_rw32(dp->d_ino, needswap) == parentino) + continue; + return (0); + } + return (1); +} + +/* + * Check if source directory is in the path of the target directory. + * Target is supplied locked, source is unlocked. + * The target is always vput before returning. 
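/*
 * Illustrative sketch (not taken from the patch): the check described above
 * (implemented just below as ufs_checkpath()) amounts to walking ".." from
 * the target toward the root and looking for the source.  A hedged
 * user-space analogue using POSIX calls and (st_dev, st_ino) pairs instead
 * of locked vnodes; is_ancestor() and its behaviour are inventions of this
 * sketch.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

/* Return 1 if "ancestor" lies on the path from "start" up to the root. */
static int is_ancestor(const char *ancestor, const char *start)
{
    struct stat anc, cur, next;
    int fd, nfd, found = 0;

    if (stat(ancestor, &anc) == -1)
        return -1;
    if ((fd = open(start, O_RDONLY | O_DIRECTORY)) == -1)
        return -1;
    for (;;) {
        if (fstat(fd, &cur) == -1)
            break;
        if (cur.st_dev == anc.st_dev && cur.st_ino == anc.st_ino) {
            found = 1;                      /* reached the candidate ancestor */
            break;
        }
        if ((nfd = openat(fd, "..", O_RDONLY | O_DIRECTORY)) == -1)
            break;
        if (fstat(nfd, &next) == -1) {
            close(nfd);
            break;
        }
        if (next.st_dev == cur.st_dev && next.st_ino == cur.st_ino) {
            close(nfd);                     /* ".." of the root is the root */
            break;
        }
        close(fd);
        fd = nfd;                           /* step one directory upward */
    }
    close(fd);
    return found;
}

int main(void)
{
    printf("/ is an ancestor of /tmp: %d\n", is_ancestor("/", "/tmp"));
    return 0;
}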
+ */ +int +ufs_checkpath(struct inode *source, struct inode *target, kauth_cred_t cred) +{ + struct vnode *nextvp, *vp; + int error, rootino, namlen; + struct dirtemplate dirbuf; + const int needswap = UFS_MPNEEDSWAP(target->i_ump); + + vp = ITOV(target); + if (target->i_number == source->i_number) { + error = EEXIST; + goto out; + } + rootino = ROOTINO; + error = 0; + if (target->i_number == rootino) + goto out; + + for (;;) { + if (vp->v_type != VDIR) { + error = ENOTDIR; + break; + } + error = vn_rdwr(UIO_READ, vp, (void *)&dirbuf, + sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE, + IO_NODELOCKED, cred, NULL, NULL); + if (error != 0) + break; +#if (BYTE_ORDER == LITTLE_ENDIAN) + if (FSFMT(vp) && needswap == 0) + namlen = dirbuf.dotdot_type; + else + namlen = dirbuf.dotdot_namlen; +#else + if (FSFMT(vp) && needswap != 0) + namlen = dirbuf.dotdot_type; + else + namlen = dirbuf.dotdot_namlen; +#endif + if (namlen != 2 || + dirbuf.dotdot_name[0] != '.' || + dirbuf.dotdot_name[1] != '.') { + error = ENOTDIR; + break; + } + if (ufs_rw32(dirbuf.dotdot_ino, needswap) == source->i_number) { + error = EINVAL; + break; + } + if (ufs_rw32(dirbuf.dotdot_ino, needswap) == rootino) + break; + VOP_UNLOCK(vp); + error = VFS_VGET(vp->v_mount, + ufs_rw32(dirbuf.dotdot_ino, needswap), &nextvp); + vrele(vp); + if (error) { + vp = NULL; + break; + } + vp = nextvp; + } + +out: + if (error == ENOTDIR) + printf("checkpath: .. not a directory\n"); + if (vp != NULL) + vput(vp); + return (error); +} + +/* + * Extract the inode number of ".." from a directory. + * Helper for ufs_parentcheck. + */ +static int +ufs_readdotdot(struct vnode *vp, int needswap, kauth_cred_t cred, ino_t *result) +{ + struct dirtemplate dirbuf; + int namlen, error; + + error = vn_rdwr(UIO_READ, vp, &dirbuf, + sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE, + IO_NODELOCKED, cred, NULL, NULL); + if (error) { + return error; + } + +#if (BYTE_ORDER == LITTLE_ENDIAN) + if (FSFMT(vp) && needswap == 0) + namlen = dirbuf.dotdot_type; + else + namlen = dirbuf.dotdot_namlen; +#else + if (FSFMT(vp) && needswap != 0) + namlen = dirbuf.dotdot_type; + else + namlen = dirbuf.dotdot_namlen; +#endif + if (namlen != 2 || + dirbuf.dotdot_name[0] != '.' || + dirbuf.dotdot_name[1] != '.') { + printf("ufs_readdotdot: directory %llu contains " + "garbage instead of ..\n", + (unsigned long long) VTOI(vp)->i_number); + return ENOTDIR; + } + *result = ufs_rw32(dirbuf.dotdot_ino, needswap); + return 0; +} + +/* + * Check if LOWER is a descendent of UPPER. If we find UPPER, return + * nonzero in FOUND and return a reference to the immediate descendent + * of UPPER in UPPERCHILD. If we don't find UPPER (that is, if we + * reach the volume root and that isn't UPPER), return zero in FOUND + * and null in UPPERCHILD. + * + * Neither UPPER nor LOWER should be locked. + * + * On error (such as a permissions error checking up the directory + * tree) fail entirely. + * + * Note that UPPER and LOWER must be on the same volume, and because + * we inspect only that volume NEEDSWAP can be constant. 
+ */ +int +ufs_parentcheck(struct vnode *upper, struct vnode *lower, kauth_cred_t cred, + int *found_ret, struct vnode **upperchild_ret) +{ + const int needswap = UFS_MPNEEDSWAP(VTOI(lower)->i_ump); + ino_t upper_ino, found_ino; + struct vnode *current, *next; + int error; + + if (upper == lower) { + vref(upper); + *found_ret = 1; + *upperchild_ret = upper; + return 0; + } + if (VTOI(lower)->i_number == ROOTINO) { + *found_ret = 0; + *upperchild_ret = NULL; + return 0; + } + + upper_ino = VTOI(upper)->i_number; + + current = lower; + vref(current); + vn_lock(current, LK_EXCLUSIVE | LK_RETRY); + + for (;;) { + error = ufs_readdotdot(current, needswap, cred, &found_ino); + if (error) { + vput(current); + return error; + } + if (found_ino == upper_ino) { + VOP_UNLOCK(current); + *found_ret = 1; + *upperchild_ret = current; + return 0; + } + if (found_ino == ROOTINO) { + vput(current); + *found_ret = 0; + *upperchild_ret = NULL; + return 0; + } + VOP_UNLOCK(current); + error = VFS_VGET(current->v_mount, found_ino, &next); + if (error) { + vrele(current); + return error; + } + KASSERT(VOP_ISLOCKED(next)); + if (next->v_type != VDIR) { + printf("ufs_parentcheck: inode %llu reached via .. of " + "inode %llu is not a directory\n", + (unsigned long long)VTOI(next)->i_number, + (unsigned long long)VTOI(current)->i_number); + vput(next); + vrele(current); + return ENOTDIR; + } + vrele(current); + current = next; + } + + return 0; +} + +#define UFS_DIRRABLKS 0 +int ufs_dirrablks = UFS_DIRRABLKS; + +/* + * ufs_blkatoff: Return buffer with the contents of block "offset" from + * the beginning of directory "vp". If "res" is non-zero, fill it in with + * a pointer to the remaining space in the directory. If the caller intends + * to modify the buffer returned, "modify" must be true. + */ + +int +ufs_blkatoff(struct vnode *vp, off_t offset, char **res, struct buf **bpp, + bool modify) +{ + struct inode *ip; + struct buf *bp; + daddr_t lbn; + const int dirrablks = ufs_dirrablks; + daddr_t *blks; + int *blksizes; + int run, error; + struct mount *mp = vp->v_mount; + const int bshift = mp->mnt_fs_bshift; + const int bsize = 1 << bshift; + off_t eof; + + blks = kmem_alloc((1 + dirrablks) * sizeof(daddr_t), KM_SLEEP); + blksizes = kmem_alloc((1 + dirrablks) * sizeof(int), KM_SLEEP); + ip = VTOI(vp); + KASSERT(vp->v_size == ip->i_size); + GOP_SIZE(vp, vp->v_size, &eof, 0); + lbn = offset >> bshift; + + for (run = 0; run <= dirrablks;) { + const off_t curoff = lbn << bshift; + const int size = MIN(eof - curoff, bsize); + + if (size == 0) { + break; + } + KASSERT(curoff < eof); + blks[run] = lbn; + blksizes[run] = size; + lbn++; + run++; + if (size != bsize) { + break; + } + } + KASSERT(run >= 1); + error = breadn(vp, blks[0], blksizes[0], &blks[1], &blksizes[1], + run - 1, NOCRED, (modify ? B_MODIFY : 0), &bp); + if (error != 0) { + brelse(bp, 0); + *bpp = NULL; + goto out; + } + if (res) { + *res = (char *)bp->b_data + (offset & (bsize - 1)); + } + *bpp = bp; + + out: + kmem_free(blks, (1 + dirrablks) * sizeof(daddr_t)); + kmem_free(blksizes, (1 + dirrablks) * sizeof(int)); + return error; +} diff --git a/sys/ufs/ufs/ufs_quota.c b/sys/ufs/ufs/ufs_quota.c new file mode 100644 index 000000000..78cef57e1 --- /dev/null +++ b/sys/ufs/ufs/ufs_quota.c @@ -0,0 +1,877 @@ +/* $NetBSD: ufs_quota.c,v 1.70 2011/03/24 17:05:46 bouyer Exp $ */ + +/* + * Copyright (c) 1982, 1986, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. 
+ * + * This code is derived from software contributed to Berkeley by + * Robert Elz at The University of Melbourne. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_quota.c 8.5 (Berkeley) 5/20/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ufs_quota.c,v 1.70 2011/03/24 17:05:46 bouyer Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_quota.h" +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +kmutex_t dqlock; +kcondvar_t dqcv; + +/* + * Code pertaining to management of the in-core dquot data structures. + */ +#define DQHASH(dqvp, id) \ + (((((long)(dqvp)) >> 8) + id) & dqhash) +static LIST_HEAD(dqhashhead, dquot) *dqhashtbl; +static u_long dqhash; +static pool_cache_t dquot_cache; + + +static int quota_handle_cmd_get_version(struct mount *, struct lwp *, + prop_dictionary_t, prop_array_t); +static int quota_handle_cmd_get(struct mount *, struct lwp *, + prop_dictionary_t, int, prop_array_t); +static int quota_handle_cmd_set(struct mount *, struct lwp *, + prop_dictionary_t, int, prop_array_t); +static int quota_handle_cmd_getall(struct mount *, struct lwp *, + prop_dictionary_t, int, prop_array_t); +static int quota_handle_cmd_clear(struct mount *, struct lwp *, + prop_dictionary_t, int, prop_array_t); +static int quota_handle_cmd_quotaon(struct mount *, struct lwp *, + prop_dictionary_t, int, prop_array_t); +static int quota_handle_cmd_quotaoff(struct mount *, struct lwp *, + prop_dictionary_t, int, prop_array_t); +/* + * Initialize the quota fields of an inode. + */ +void +ufsquota_init(struct inode *ip) +{ + int i; + + for (i = 0; i < MAXQUOTAS; i++) + ip->i_dquot[i] = NODQUOT; +} + +/* + * Release the quota fields from an inode. + */ +void +ufsquota_free(struct inode *ip) +{ + int i; + + for (i = 0; i < MAXQUOTAS; i++) { + dqrele(ITOV(ip), ip->i_dquot[i]); + ip->i_dquot[i] = NODQUOT; + } +} + +/* + * Update disk usage, and take corrective action. 
+ */ +int +chkdq(struct inode *ip, int64_t change, kauth_cred_t cred, int flags) +{ + /* do not track snapshot usage, or we will deadlock */ + if ((ip->i_flags & SF_SNAPSHOT) != 0) + return 0; + +#ifdef QUOTA + if (ip->i_ump->um_flags & UFS_QUOTA) + return chkdq1(ip, change, cred, flags); +#endif +#ifdef QUOTA2 + if (ip->i_ump->um_flags & UFS_QUOTA2) + return chkdq2(ip, change, cred, flags); +#endif + return 0; +} + +/* + * Check the inode limit, applying corrective action. + */ +int +chkiq(struct inode *ip, int32_t change, kauth_cred_t cred, int flags) +{ + /* do not track snapshot usage, or we will deadlock */ + if ((ip->i_flags & SF_SNAPSHOT) != 0) + return 0; +#ifdef QUOTA + if (ip->i_ump->um_flags & UFS_QUOTA) + return chkiq1(ip, change, cred, flags); +#endif +#ifdef QUOTA2 + if (ip->i_ump->um_flags & UFS_QUOTA2) + return chkiq2(ip, change, cred, flags); +#endif + return 0; +} + +int +quota_handle_cmd(struct mount *mp, struct lwp *l, prop_dictionary_t cmddict) +{ + int error = 0; + const char *cmd, *type; + prop_array_t datas; + int q2type; + + if (!prop_dictionary_get_cstring_nocopy(cmddict, "command", &cmd)) + return EINVAL; + if (!prop_dictionary_get_cstring_nocopy(cmddict, "type", &type)) + return EINVAL; + if (!strcmp(type, QUOTADICT_CLASS_USER)) { + q2type = USRQUOTA; + } else if (!strcmp(type, QUOTADICT_CLASS_GROUP)) { + q2type = GRPQUOTA; + } else + return EOPNOTSUPP; + datas = prop_dictionary_get(cmddict, "data"); + if (datas == NULL || prop_object_type(datas) != PROP_TYPE_ARRAY) + return EINVAL; + + prop_object_retain(datas); + prop_dictionary_remove(cmddict, "data"); /* prepare for return */ + + if (strcmp(cmd, "get version") == 0) { + error = quota_handle_cmd_get_version(mp, l, cmddict, datas); + goto end; + } + if (strcmp(cmd, "quotaon") == 0) { + error = quota_handle_cmd_quotaon(mp, l, cmddict, + q2type, datas); + goto end; + } + if (strcmp(cmd, "quotaoff") == 0) { + error = quota_handle_cmd_quotaoff(mp, l, cmddict, + q2type, datas); + goto end; + } + if (strcmp(cmd, "get") == 0) { + error = quota_handle_cmd_get(mp, l, cmddict, q2type, datas); + goto end; + } + if (strcmp(cmd, "set") == 0) { + error = quota_handle_cmd_set(mp, l, cmddict, q2type, datas); + goto end; + } + if (strcmp(cmd, "getall") == 0) { + error = quota_handle_cmd_getall(mp, l, cmddict, q2type, datas); + goto end; + } + if (strcmp(cmd, "clear") == 0) { + error = quota_handle_cmd_clear(mp, l, cmddict, q2type, datas); + goto end; + } + error = EOPNOTSUPP; +end: + error = (prop_dictionary_set_int8(cmddict, "return", + error) ? 
0 : ENOMEM); + prop_object_release(datas); + return error; +} + +static int +quota_handle_cmd_get_version(struct mount *mp, struct lwp *l, + prop_dictionary_t cmddict, prop_array_t datas) +{ + struct ufsmount *ump = VFSTOUFS(mp); + prop_array_t replies; + prop_dictionary_t data; + int error = 0; + + if ((ump->um_flags & (UFS_QUOTA|UFS_QUOTA2)) == 0) + return EOPNOTSUPP; + + replies = prop_array_create(); + if (replies == NULL) + return ENOMEM; + + data = prop_dictionary_create(); + if (data == NULL) { + prop_object_release(replies); + return ENOMEM; + } + +#ifdef QUOTA + if (ump->um_flags & UFS_QUOTA) { + if (!prop_dictionary_set_int8(data, "version", 1)) + error = ENOMEM; + } else +#endif +#ifdef QUOTA2 + if (ump->um_flags & UFS_QUOTA2) { + if (!prop_dictionary_set_int8(data, "version", 2)) + error = ENOMEM; + } else +#endif + error = 0; + if (error) + prop_object_release(data); + else if (!prop_array_add_and_rel(replies, data)) + error = ENOMEM; + if (error) + prop_object_release(replies); + else if (!prop_dictionary_set_and_rel(cmddict, "data", replies)) + error = ENOMEM; + return error; +} + +/* XXX shouldn't all this be in kauth ? */ +static int +quota_get_auth(struct mount *mp, struct lwp *l, uid_t id) { + /* The user can always query about his own quota. */ + if (id == kauth_cred_getuid(l->l_cred)) + return 0; + return kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, + KAUTH_REQ_SYSTEM_FS_QUOTA_GET, mp, KAUTH_ARG(id), NULL); +} + +static int +quota_handle_cmd_get(struct mount *mp, struct lwp *l, + prop_dictionary_t cmddict, int type, prop_array_t datas) +{ + prop_array_t replies; + prop_object_iterator_t iter; + prop_dictionary_t data; + uint32_t id; + struct ufsmount *ump = VFSTOUFS(mp); + int error, defaultq = 0; + const char *idstr; + + if ((ump->um_flags & (UFS_QUOTA|UFS_QUOTA2)) == 0) + return EOPNOTSUPP; + + replies = prop_array_create(); + if (replies == NULL) + return ENOMEM; + + iter = prop_array_iterator(datas); + if (iter == NULL) { + prop_object_release(replies); + return ENOMEM; + } + while ((data = prop_object_iterator_next(iter)) != NULL) { + if (!prop_dictionary_get_uint32(data, "id", &id)) { + if (!prop_dictionary_get_cstring_nocopy(data, "id", + &idstr)) + continue; + if (strcmp(idstr, "default")) { + error = EINVAL; + goto err; + } + id = 0; + defaultq = 1; + } else { + defaultq = 0; + } + error = quota_get_auth(mp, l, id); + if (error == EPERM) + continue; + if (error != 0) + goto err; +#ifdef QUOTA + if (ump->um_flags & UFS_QUOTA) + error = quota1_handle_cmd_get(ump, type, id, defaultq, + replies); + else +#endif +#ifdef QUOTA2 + if (ump->um_flags & UFS_QUOTA2) { + error = quota2_handle_cmd_get(ump, type, id, defaultq, + replies); + } else +#endif + panic("quota_handle_cmd_get: no support ?"); + + if (error == ENOENT) + continue; + if (error != 0) + goto err; + } + prop_object_iterator_release(iter); + if (!prop_dictionary_set_and_rel(cmddict, "data", replies)) { + error = ENOMEM; + } else { + error = 0; + } + return error; +err: + prop_object_iterator_release(iter); + prop_object_release(replies); + return error; +} + +static int +quota_handle_cmd_set(struct mount *mp, struct lwp *l, + prop_dictionary_t cmddict, int type, prop_array_t datas) +{ + prop_array_t replies; + prop_object_iterator_t iter; + prop_dictionary_t data; + uint32_t id; + struct ufsmount *ump = VFSTOUFS(mp); + int error, defaultq = 0; + const char *idstr; + + if ((ump->um_flags & (UFS_QUOTA|UFS_QUOTA2)) == 0) + return EOPNOTSUPP; + + replies = prop_array_create(); + if (replies == 
NULL) + return ENOMEM; + + iter = prop_array_iterator(datas); + if (iter == NULL) { + prop_object_release(replies); + return ENOMEM; + } + while ((data = prop_object_iterator_next(iter)) != NULL) { + if (!prop_dictionary_get_uint32(data, "id", &id)) { + if (!prop_dictionary_get_cstring_nocopy(data, "id", + &idstr)) + continue; + if (strcmp(idstr, "default")) + continue; + id = 0; + defaultq = 1; + } else { + defaultq = 0; + } + error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, + KAUTH_REQ_SYSTEM_FS_QUOTA_MANAGE, mp, KAUTH_ARG(id), NULL); + if (error != 0) + goto err; +#ifdef QUOTA + if (ump->um_flags & UFS_QUOTA) + error = quota1_handle_cmd_set(ump, type, id, defaultq, + data); + else +#endif +#ifdef QUOTA2 + if (ump->um_flags & UFS_QUOTA2) { + error = quota2_handle_cmd_set(ump, type, id, defaultq, + data); + } else +#endif + panic("quota_handle_cmd_get: no support ?"); + + if (error && error != ENOENT) + goto err; + } + prop_object_iterator_release(iter); + if (!prop_dictionary_set_and_rel(cmddict, "data", replies)) { + error = ENOMEM; + } else { + error = 0; + } + return error; +err: + prop_object_iterator_release(iter); + prop_object_release(replies); + return error; +} + +static int +quota_handle_cmd_clear(struct mount *mp, struct lwp *l, + prop_dictionary_t cmddict, int type, prop_array_t datas) +{ + prop_array_t replies; + prop_object_iterator_t iter; + prop_dictionary_t data; + uint32_t id; + struct ufsmount *ump = VFSTOUFS(mp); + int error, defaultq = 0; + const char *idstr; + + if ((ump->um_flags & UFS_QUOTA2) == 0) + return EOPNOTSUPP; + + replies = prop_array_create(); + if (replies == NULL) + return ENOMEM; + + iter = prop_array_iterator(datas); + if (iter == NULL) { + prop_object_release(replies); + return ENOMEM; + } + while ((data = prop_object_iterator_next(iter)) != NULL) { + if (!prop_dictionary_get_uint32(data, "id", &id)) { + if (!prop_dictionary_get_cstring_nocopy(data, "id", + &idstr)) + continue; + if (strcmp(idstr, "default")) + continue; + id = 0; + defaultq = 1; + } else { + defaultq = 0; + } + error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, + KAUTH_REQ_SYSTEM_FS_QUOTA_MANAGE, mp, KAUTH_ARG(id), NULL); + if (error != 0) + goto err; +#ifdef QUOTA2 + if (ump->um_flags & UFS_QUOTA2) { + error = quota2_handle_cmd_clear(ump, type, id, defaultq, + data); + } else +#endif + panic("quota_handle_cmd_get: no support ?"); + + if (error && error != ENOENT) + goto err; + } + prop_object_iterator_release(iter); + if (!prop_dictionary_set_and_rel(cmddict, "data", replies)) { + error = ENOMEM; + } else { + error = 0; + } + return error; +err: + prop_object_iterator_release(iter); + prop_object_release(replies); + return error; +} + +static int +quota_handle_cmd_getall(struct mount *mp, struct lwp *l, + prop_dictionary_t cmddict, int type, prop_array_t datas) +{ + prop_array_t replies; + struct ufsmount *ump = VFSTOUFS(mp); + int error; + + if ((ump->um_flags & UFS_QUOTA2) == 0) + return EOPNOTSUPP; + + error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, + KAUTH_REQ_SYSTEM_FS_QUOTA_GET, mp, NULL, NULL); + if (error) + return error; + + replies = prop_array_create(); + if (replies == NULL) + return ENOMEM; + +#ifdef QUOTA2 + if (ump->um_flags & UFS_QUOTA2) { + error = quota2_handle_cmd_getall(ump, type, replies); + } else +#endif + panic("quota_handle_cmd_getall: no support ?"); + if (!prop_dictionary_set_and_rel(cmddict, "data", replies)) { + error = ENOMEM; + } else { + error = 0; + } + return error; +} + +static int 
+quota_handle_cmd_quotaon(struct mount *mp, struct lwp *l, + prop_dictionary_t cmddict, int type, prop_array_t datas) +{ + prop_dictionary_t data; + struct ufsmount *ump = VFSTOUFS(mp); + int error; + const char *qfile; + + if ((ump->um_flags & UFS_QUOTA2) != 0) + return EBUSY; + + if (prop_array_count(datas) != 1) + return EINVAL; + + data = prop_array_get(datas, 0); + if (data == NULL) + return ENOMEM; + if (!prop_dictionary_get_cstring_nocopy(data, "quotafile", + &qfile)) + return EINVAL; + + error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, + KAUTH_REQ_SYSTEM_FS_QUOTA_ONOFF, mp, NULL, NULL); + if (error != 0) { + return error; + } +#ifdef QUOTA + error = quota1_handle_cmd_quotaon(l, ump, type, qfile); +#else + error = EOPNOTSUPP; +#endif + + return error; +} + +static int +quota_handle_cmd_quotaoff(struct mount *mp, struct lwp *l, + prop_dictionary_t cmddict, int type, prop_array_t datas) +{ + struct ufsmount *ump = VFSTOUFS(mp); + int error; + + if ((ump->um_flags & UFS_QUOTA2) != 0) + return EOPNOTSUPP; + + if (prop_array_count(datas) != 0) + return EINVAL; + + error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, + KAUTH_REQ_SYSTEM_FS_QUOTA_ONOFF, mp, NULL, NULL); + if (error != 0) { + return error; + } +#ifdef QUOTA + error = quota1_handle_cmd_quotaoff(l, ump, type); +#else + error = EOPNOTSUPP; +#endif + + return error; +} + +/* + * Initialize the quota system. + */ +void +dqinit(void) +{ + + mutex_init(&dqlock, MUTEX_DEFAULT, IPL_NONE); + cv_init(&dqcv, "quota"); + dqhashtbl = hashinit(desiredvnodes, HASH_LIST, true, &dqhash); + dquot_cache = pool_cache_init(sizeof(struct dquot), 0, 0, 0, "ufsdq", + NULL, IPL_NONE, NULL, NULL, NULL); +} + +void +dqreinit(void) +{ + struct dquot *dq; + struct dqhashhead *oldhash, *hash; + struct vnode *dqvp; + u_long oldmask, mask, hashval; + int i; + + hash = hashinit(desiredvnodes, HASH_LIST, true, &mask); + mutex_enter(&dqlock); + oldhash = dqhashtbl; + oldmask = dqhash; + dqhashtbl = hash; + dqhash = mask; + for (i = 0; i <= oldmask; i++) { + while ((dq = LIST_FIRST(&oldhash[i])) != NULL) { + dqvp = dq->dq_ump->um_quotas[dq->dq_type]; + LIST_REMOVE(dq, dq_hash); + hashval = DQHASH(dqvp, dq->dq_id); + LIST_INSERT_HEAD(&dqhashtbl[hashval], dq, dq_hash); + } + } + mutex_exit(&dqlock); + hashdone(oldhash, HASH_LIST, oldmask); +} + +/* + * Free resources held by quota system. + */ +void +dqdone(void) +{ + + pool_cache_destroy(dquot_cache); + hashdone(dqhashtbl, HASH_LIST, dqhash); + cv_destroy(&dqcv); + mutex_destroy(&dqlock); +} + +/* + * Set up the quotas for an inode. + * + * This routine completely defines the semantics of quotas. + * If other criterion want to be used to establish quotas, the + * MAXQUOTAS value in quotas.h should be increased, and the + * additional dquots set up here. + */ +int +getinoquota(struct inode *ip) +{ + struct ufsmount *ump = ip->i_ump; + struct vnode *vp = ITOV(ip); + int i, error; + u_int32_t ino_ids[MAXQUOTAS]; + + /* + * To avoid deadlocks never update quotas for quota files + * on the same file system + */ + for (i = 0; i < MAXQUOTAS; i++) + if (vp == ump->um_quotas[i]) + return 0; + + ino_ids[USRQUOTA] = ip->i_uid; + ino_ids[GRPQUOTA] = ip->i_gid; + for (i = 0; i < MAXQUOTAS; i++) { + /* + * If the file id changed the quota needs update. + */ + if (ip->i_dquot[i] != NODQUOT && + ip->i_dquot[i]->dq_id != ino_ids[i]) { + dqrele(ITOV(ip), ip->i_dquot[i]); + ip->i_dquot[i] = NODQUOT; + } + /* + * Set up the quota based on file id. + * ENODEV means that quotas are not enabled. 
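/*
 * Illustrative sketch (not taken from the patch): dqreinit() above resizes
 * the dquot hash by unlinking every entry from the old chains and pushing it
 * onto a bucket of the freshly sized table.  The same rehash pattern in a
 * minimal, single-threaded form; toy_dq and the masks are inventions here.
 */
#include <stdio.h>

struct toy_dq {
    unsigned id;
    struct toy_dq *next;
};

static void rehash(struct toy_dq **oldtbl, unsigned oldmask,
                   struct toy_dq **newtbl, unsigned newmask)
{
    for (unsigned i = 0; i <= oldmask; i++) {
        struct toy_dq *dq;

        while ((dq = oldtbl[i]) != NULL) {
            oldtbl[i] = dq->next;            /* unlink from the old chain */
            unsigned h = dq->id & newmask;   /* recompute the bucket */
            dq->next = newtbl[h];            /* push onto the new chain */
            newtbl[h] = dq;
        }
    }
}

int main(void)
{
    struct toy_dq a = { 5, NULL }, b = { 13, &a };
    struct toy_dq *oldtbl[4] = { NULL }, *newtbl[16] = { NULL };

    oldtbl[5 & 3] = &b;                      /* ids 5 and 13 share bucket 1 */
    rehash(oldtbl, 3, newtbl, 15);
    printf("bucket 5 -> %u, bucket 13 -> %u\n", newtbl[5]->id, newtbl[13]->id);
    return 0;
}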
+ */ + if (ip->i_dquot[i] == NODQUOT && + (error = dqget(vp, ino_ids[i], ump, i, &ip->i_dquot[i])) && + error != ENODEV) + return (error); + } + return 0; +} + +/* + * Obtain a dquot structure for the specified identifier and quota file + * reading the information from the file if necessary. + */ +int +dqget(struct vnode *vp, u_long id, struct ufsmount *ump, int type, + struct dquot **dqp) +{ + struct dquot *dq, *ndq; + struct dqhashhead *dqh; + struct vnode *dqvp; + int error = 0; /* XXX gcc */ + + /* Lock to see an up to date value for QTF_CLOSING. */ + mutex_enter(&dqlock); + if ((ump->um_flags & (UFS_QUOTA|UFS_QUOTA2)) == 0) { + mutex_exit(&dqlock); + *dqp = NODQUOT; + return (ENODEV); + } + dqvp = ump->um_quotas[type]; +#ifdef QUOTA + if (ump->um_flags & UFS_QUOTA) { + if (dqvp == NULLVP || (ump->umq1_qflags[type] & QTF_CLOSING)) { + mutex_exit(&dqlock); + *dqp = NODQUOT; + return (ENODEV); + } + } +#endif +#ifdef QUOTA2 + if (ump->um_flags & UFS_QUOTA2) { + if (dqvp == NULLVP) { + mutex_exit(&dqlock); + *dqp = NODQUOT; + return (ENODEV); + } + } +#endif + KASSERT(dqvp != vp); + /* + * Check the cache first. + */ + dqh = &dqhashtbl[DQHASH(dqvp, id)]; + LIST_FOREACH(dq, dqh, dq_hash) { + if (dq->dq_id != id || + dq->dq_ump->um_quotas[dq->dq_type] != dqvp) + continue; + KASSERT(dq->dq_cnt > 0); + dqref(dq); + mutex_exit(&dqlock); + *dqp = dq; + return (0); + } + /* + * Not in cache, allocate a new one. + */ + mutex_exit(&dqlock); + ndq = pool_cache_get(dquot_cache, PR_WAITOK); + /* + * Initialize the contents of the dquot structure. + */ + memset((char *)ndq, 0, sizeof *ndq); + ndq->dq_flags = 0; + ndq->dq_id = id; + ndq->dq_ump = ump; + ndq->dq_type = type; + mutex_init(&ndq->dq_interlock, MUTEX_DEFAULT, IPL_NONE); + mutex_enter(&dqlock); + dqh = &dqhashtbl[DQHASH(dqvp, id)]; + LIST_FOREACH(dq, dqh, dq_hash) { + if (dq->dq_id != id || + dq->dq_ump->um_quotas[dq->dq_type] != dqvp) + continue; + /* + * Another thread beat us allocating this dquot. + */ + KASSERT(dq->dq_cnt > 0); + dqref(dq); + mutex_exit(&dqlock); + mutex_destroy(&ndq->dq_interlock); + pool_cache_put(dquot_cache, ndq); + *dqp = dq; + return 0; + } + dq = ndq; + LIST_INSERT_HEAD(dqh, dq, dq_hash); + dqref(dq); + mutex_enter(&dq->dq_interlock); + mutex_exit(&dqlock); +#ifdef QUOTA + if (ump->um_flags & UFS_QUOTA) + error = dq1get(dqvp, id, ump, type, dq); +#endif +#ifdef QUOTA2 + if (ump->um_flags & UFS_QUOTA2) + error = dq2get(dqvp, id, ump, type, dq); +#endif + /* + * I/O error in reading quota file, release + * quota structure and reflect problem to caller. + */ + if (error) { + mutex_enter(&dqlock); + LIST_REMOVE(dq, dq_hash); + mutex_exit(&dqlock); + mutex_exit(&dq->dq_interlock); + dqrele(vp, dq); + *dqp = NODQUOT; + return (error); + } + mutex_exit(&dq->dq_interlock); + *dqp = dq; + return (0); +} + +/* + * Obtain a reference to a dquot. + */ +void +dqref(struct dquot *dq) +{ + + KASSERT(mutex_owned(&dqlock)); + dq->dq_cnt++; + KASSERT(dq->dq_cnt > 0); +} + +/* + * Release a reference to a dquot. 
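/*
 * Illustrative sketch (not taken from the patch): dqget() above looks the id
 * up in the hash, drops dqlock to allocate a fresh dquot, then re-checks the
 * hash under the lock because another thread may have inserted the same id
 * in the meantime; the loser throws its allocation away.  A hedged pthread
 * restatement of that pattern with invented types, not the kernel API.
 */
#include <pthread.h>
#include <stdlib.h>

struct entry {
    unsigned id;
    struct entry *next;
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry *table_head;

static struct entry *lookup_locked(unsigned id)
{
    for (struct entry *e = table_head; e != NULL; e = e->next)
        if (e->id == id)
            return e;
    return NULL;
}

struct entry *get_entry(unsigned id)
{
    struct entry *e, *fresh;

    pthread_mutex_lock(&table_lock);
    e = lookup_locked(id);
    pthread_mutex_unlock(&table_lock);
    if (e != NULL)
        return e;

    fresh = calloc(1, sizeof(*fresh));       /* allocate with no lock held */
    if (fresh == NULL)
        return NULL;
    fresh->id = id;

    pthread_mutex_lock(&table_lock);
    e = lookup_locked(id);                   /* re-check: did someone race us? */
    if (e == NULL) {
        fresh->next = table_head;            /* we won: publish our entry */
        table_head = fresh;
        e = fresh;
        fresh = NULL;
    }
    pthread_mutex_unlock(&table_lock);
    free(fresh);                             /* no-op when we won the race */
    return e;
}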
+ */ +void +dqrele(struct vnode *vp, struct dquot *dq) +{ + + if (dq == NODQUOT) + return; + mutex_enter(&dq->dq_interlock); + for (;;) { + mutex_enter(&dqlock); + if (dq->dq_cnt > 1) { + dq->dq_cnt--; + mutex_exit(&dqlock); + mutex_exit(&dq->dq_interlock); + return; + } + if ((dq->dq_flags & DQ_MOD) == 0) + break; + mutex_exit(&dqlock); +#ifdef QUOTA + if (dq->dq_ump->um_flags & UFS_QUOTA) + (void) dq1sync(vp, dq); +#endif +#ifdef QUOTA2 + if (dq->dq_ump->um_flags & UFS_QUOTA2) + (void) dq2sync(vp, dq); +#endif + } + KASSERT(dq->dq_cnt == 1 && (dq->dq_flags & DQ_MOD) == 0); + LIST_REMOVE(dq, dq_hash); + mutex_exit(&dqlock); + mutex_exit(&dq->dq_interlock); + mutex_destroy(&dq->dq_interlock); + pool_cache_put(dquot_cache, dq); +} + +int +qsync(struct mount *mp) +{ + struct ufsmount *ump = VFSTOUFS(mp); +#ifdef QUOTA + if (ump->um_flags & UFS_QUOTA) + return q1sync(mp); +#endif +#ifdef QUOTA2 + if (ump->um_flags & UFS_QUOTA2) + return q2sync(mp); +#endif + return 0; +} + +#ifdef DIAGNOSTIC +/* + * Check the hash chains for stray dquot's. + */ +void +dqflush(struct vnode *vp) +{ + struct dquot *dq; + int i; + + mutex_enter(&dqlock); + for (i = 0; i <= dqhash; i++) + LIST_FOREACH(dq, &dqhashtbl[i], dq_hash) + KASSERT(dq->dq_ump->um_quotas[dq->dq_type] != vp); + mutex_exit(&dqlock); +} +#endif diff --git a/include/ufs/ufs/ufs_quota.h b/sys/ufs/ufs/ufs_quota.h similarity index 100% rename from include/ufs/ufs/ufs_quota.h rename to sys/ufs/ufs/ufs_quota.h diff --git a/sys/ufs/ufs/ufs_quota1.c b/sys/ufs/ufs/ufs_quota1.c new file mode 100644 index 000000000..4fdb57c95 --- /dev/null +++ b/sys/ufs/ufs/ufs_quota1.c @@ -0,0 +1,885 @@ +/* $NetBSD: ufs_quota1.c,v 1.6 2011/11/25 16:55:05 dholland Exp $ */ + +/* + * Copyright (c) 1982, 1986, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Robert Elz at The University of Melbourne. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ufs_quota.c 8.5 (Berkeley) 5/20/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ufs_quota1.c,v 1.6 2011/11/25 16:55:05 dholland Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static int chkdqchg(struct inode *, int64_t, kauth_cred_t, int); +static int chkiqchg(struct inode *, int32_t, kauth_cred_t, int); + +/* + * Update disk usage, and take corrective action. + */ +int +chkdq1(struct inode *ip, int64_t change, kauth_cred_t cred, int flags) +{ + struct dquot *dq; + int i; + int ncurblocks, error; + + if ((error = getinoquota(ip)) != 0) + return error; + if (change == 0) + return (0); + if (change < 0) { + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + mutex_enter(&dq->dq_interlock); + ncurblocks = dq->dq_curblocks + change; + if (ncurblocks >= 0) + dq->dq_curblocks = ncurblocks; + else + dq->dq_curblocks = 0; + dq->dq_flags &= ~DQ_WARN(QL_BLOCK); + dq->dq_flags |= DQ_MOD; + mutex_exit(&dq->dq_interlock); + } + return (0); + } + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + if ((flags & FORCE) == 0 && + kauth_authorize_system(cred, KAUTH_SYSTEM_FS_QUOTA, + KAUTH_REQ_SYSTEM_FS_QUOTA_NOLIMIT, KAUTH_ARG(i), + KAUTH_ARG(QL_BLOCK), NULL) != 0) { + mutex_enter(&dq->dq_interlock); + error = chkdqchg(ip, change, cred, i); + mutex_exit(&dq->dq_interlock); + if (error != 0) + return (error); + } + } + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + mutex_enter(&dq->dq_interlock); + dq->dq_curblocks += change; + dq->dq_flags |= DQ_MOD; + mutex_exit(&dq->dq_interlock); + } + return (0); +} + +/* + * Check for a valid change to a users allocation. + * Issue an error message if appropriate. + */ +static int +chkdqchg(struct inode *ip, int64_t change, kauth_cred_t cred, int type) +{ + struct dquot *dq = ip->i_dquot[type]; + long ncurblocks = dq->dq_curblocks + change; + + KASSERT(mutex_owned(&dq->dq_interlock)); + /* + * If user would exceed their hard limit, disallow space allocation. + */ + if (ncurblocks >= dq->dq_bhardlimit && dq->dq_bhardlimit) { + if ((dq->dq_flags & DQ_WARN(QL_BLOCK)) == 0 && + ip->i_uid == kauth_cred_geteuid(cred)) { + uprintf("\n%s: write failed, %s disk limit reached\n", + ITOV(ip)->v_mount->mnt_stat.f_mntonname, + quotatypes[type]); + dq->dq_flags |= DQ_WARN(QL_BLOCK); + } + return (EDQUOT); + } + /* + * If user is over their soft limit for too long, disallow space + * allocation. Reset time limit as they cross their soft limit. + */ + if (ncurblocks >= dq->dq_bsoftlimit && dq->dq_bsoftlimit) { + if (dq->dq_curblocks < dq->dq_bsoftlimit) { + dq->dq_btime = + time_second + ip->i_ump->umq1_btime[type]; + if (ip->i_uid == kauth_cred_geteuid(cred)) + uprintf("\n%s: warning, %s %s\n", + ITOV(ip)->v_mount->mnt_stat.f_mntonname, + quotatypes[type], "disk quota exceeded"); + return (0); + } + if (time_second > dq->dq_btime) { + if ((dq->dq_flags & DQ_WARN(QL_BLOCK)) == 0 && + ip->i_uid == kauth_cred_geteuid(cred)) { + uprintf("\n%s: write failed, %s %s\n", + ITOV(ip)->v_mount->mnt_stat.f_mntonname, + quotatypes[type], + "disk quota exceeded for too long"); + dq->dq_flags |= DQ_WARN(QL_BLOCK); + } + return (EDQUOT); + } + } + return (0); +} + +/* + * Check the inode limit, applying corrective action. 
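/*
 * Illustrative sketch (not taken from the patch): the hard/soft/grace logic
 * in chkdqchg() above (mirrored for inodes by chkiqchg() below) reduces to
 * three cases.  A stand-alone model with invented field names; EDQUOT is
 * replaced by a plain -1 and usage accounting is left to the caller, as in
 * the kernel code.
 */
#include <stdio.h>
#include <time.h>

struct toy_quota {
    long cur, soft, hard;    /* current usage and limits; 0 means "no limit" */
    time_t grace_ends;       /* armed when the soft limit is first crossed */
    long grace_secs;
};

/* Return 0 if "want" more units may be allocated, -1 if over quota. */
static int toy_chkdqchg(struct toy_quota *q, long want, time_t now)
{
    long next = q->cur + want;

    if (q->hard && next >= q->hard)
        return -1;                           /* hard limit: always refuse */
    if (q->soft && next >= q->soft) {
        if (q->cur < q->soft) {
            q->grace_ends = now + q->grace_secs; /* just crossed: start grace */
            return 0;
        }
        if (now > q->grace_ends)
            return -1;                       /* over the soft limit too long */
    }
    return 0;
}

int main(void)
{
    struct toy_quota q = { 90, 100, 200, 0, 7 * 24 * 3600 };
    time_t now = time(NULL);

    printf("%d\n", toy_chkdqchg(&q, 20, now));    /* 0: grace period starts */
    printf("%d\n", toy_chkdqchg(&q, 200, now));   /* -1: would hit hard limit */
    return 0;
}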
+ */ +int +chkiq1(struct inode *ip, int32_t change, kauth_cred_t cred, int flags) +{ + struct dquot *dq; + int i; + int ncurinodes, error; + + if ((error = getinoquota(ip)) != 0) + return error; + if (change == 0) + return (0); + if (change < 0) { + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + mutex_enter(&dq->dq_interlock); + ncurinodes = dq->dq_curinodes + change; + if (ncurinodes >= 0) + dq->dq_curinodes = ncurinodes; + else + dq->dq_curinodes = 0; + dq->dq_flags &= ~DQ_WARN(QL_FILE); + dq->dq_flags |= DQ_MOD; + mutex_exit(&dq->dq_interlock); + } + return (0); + } + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + if ((flags & FORCE) == 0 && kauth_authorize_system(cred, + KAUTH_SYSTEM_FS_QUOTA, KAUTH_REQ_SYSTEM_FS_QUOTA_NOLIMIT, + KAUTH_ARG(i), KAUTH_ARG(QL_FILE), NULL) != 0) { + mutex_enter(&dq->dq_interlock); + error = chkiqchg(ip, change, cred, i); + mutex_exit(&dq->dq_interlock); + if (error != 0) + return (error); + } + } + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + mutex_enter(&dq->dq_interlock); + dq->dq_curinodes += change; + dq->dq_flags |= DQ_MOD; + mutex_exit(&dq->dq_interlock); + } + return (0); +} + +/* + * Check for a valid change to a users allocation. + * Issue an error message if appropriate. + */ +static int +chkiqchg(struct inode *ip, int32_t change, kauth_cred_t cred, int type) +{ + struct dquot *dq = ip->i_dquot[type]; + long ncurinodes = dq->dq_curinodes + change; + + KASSERT(mutex_owned(&dq->dq_interlock)); + /* + * If user would exceed their hard limit, disallow inode allocation. + */ + if (ncurinodes >= dq->dq_ihardlimit && dq->dq_ihardlimit) { + if ((dq->dq_flags & DQ_WARN(QL_FILE)) == 0 && + ip->i_uid == kauth_cred_geteuid(cred)) { + uprintf("\n%s: write failed, %s inode limit reached\n", + ITOV(ip)->v_mount->mnt_stat.f_mntonname, + quotatypes[type]); + dq->dq_flags |= DQ_WARN(QL_FILE); + } + return (EDQUOT); + } + /* + * If user is over their soft limit for too long, disallow inode + * allocation. Reset time limit as they cross their soft limit. + */ + if (ncurinodes >= dq->dq_isoftlimit && dq->dq_isoftlimit) { + if (dq->dq_curinodes < dq->dq_isoftlimit) { + dq->dq_itime = + time_second + ip->i_ump->umq1_itime[type]; + if (ip->i_uid == kauth_cred_geteuid(cred)) + uprintf("\n%s: warning, %s %s\n", + ITOV(ip)->v_mount->mnt_stat.f_mntonname, + quotatypes[type], "inode quota exceeded"); + return (0); + } + if (time_second > dq->dq_itime) { + if ((dq->dq_flags & DQ_WARN(QL_FILE)) == 0 && + ip->i_uid == kauth_cred_geteuid(cred)) { + uprintf("\n%s: write failed, %s %s\n", + ITOV(ip)->v_mount->mnt_stat.f_mntonname, + quotatypes[type], + "inode quota exceeded for too long"); + dq->dq_flags |= DQ_WARN(QL_FILE); + } + return (EDQUOT); + } + } + return (0); +} + +int +quota1_umount(struct mount *mp, int flags) +{ + int i, error; + struct ufsmount *ump = VFSTOUFS(mp); + struct lwp *l = curlwp; + + if ((ump->um_flags & UFS_QUOTA) == 0) + return 0; + + if ((error = vflush(mp, NULLVP, SKIPSYSTEM | flags)) != 0) + return (error); + + for (i = 0; i < MAXQUOTAS; i++) { + if (ump->um_quotas[i] != NULLVP) { + quota1_handle_cmd_quotaoff(l, ump, i); + } + } + return 0; +} + +/* + * Code to process quotactl commands. + */ + +/* + * set up a quota file for a particular file system. 
+ */ +int +quota1_handle_cmd_quotaon(struct lwp *l, struct ufsmount *ump, int type, + const char *fname) +{ + struct mount *mp = ump->um_mountp; + struct vnode *vp, **vpp, *mvp; + struct dquot *dq; + int error; + struct pathbuf *pb; + struct nameidata nd; + + if (ump->um_flags & UFS_QUOTA2) { + uprintf("%s: quotas v2 already enabled\n", + mp->mnt_stat.f_mntonname); + return (EBUSY); + } + + if (mp->mnt_wapbl != NULL) { + printf("%s: quota v1 cannot be used with -o log\n", + mp->mnt_stat.f_mntonname); + return (EOPNOTSUPP); + } + + vpp = &ump->um_quotas[type]; + + pb = pathbuf_create(fname); + if (pb == NULL) { + return ENOMEM; + } + NDINIT(&nd, LOOKUP, FOLLOW, pb); + if ((error = vn_open(&nd, FREAD|FWRITE, 0)) != 0) { + pathbuf_destroy(pb); + return error; + } + vp = nd.ni_vp; + pathbuf_destroy(pb); + + VOP_UNLOCK(vp); + if (vp->v_type != VREG) { + (void) vn_close(vp, FREAD|FWRITE, l->l_cred); + return (EACCES); + } + if (*vpp != vp) + quota1_handle_cmd_quotaoff(l, ump, type); + mutex_enter(&dqlock); + while ((ump->umq1_qflags[type] & (QTF_CLOSING | QTF_OPENING)) != 0) + cv_wait(&dqcv, &dqlock); + ump->umq1_qflags[type] |= QTF_OPENING; + mutex_exit(&dqlock); + mp->mnt_flag |= MNT_QUOTA; + vp->v_vflag |= VV_SYSTEM; /* XXXSMP */ + *vpp = vp; + /* + * Save the credential of the process that turned on quotas. + * Set up the time limits for this quota. + */ + kauth_cred_hold(l->l_cred); + ump->um_cred[type] = l->l_cred; + ump->umq1_btime[type] = MAX_DQ_TIME; + ump->umq1_itime[type] = MAX_IQ_TIME; + if (dqget(NULLVP, 0, ump, type, &dq) == 0) { + if (dq->dq_btime > 0) + ump->umq1_btime[type] = dq->dq_btime; + if (dq->dq_itime > 0) + ump->umq1_itime[type] = dq->dq_itime; + dqrele(NULLVP, dq); + } + /* Allocate a marker vnode. */ + mvp = vnalloc(mp); + /* + * Search vnodes associated with this mount point, + * adding references to quota file being opened. + * NB: only need to add dquot's for inodes being modified. + */ + mutex_enter(&mntvnode_lock); +again: + for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) { + vmark(mvp, vp); + mutex_enter(vp->v_interlock); + if (VTOI(vp) == NULL || vp->v_mount != mp || vismarker(vp) || + vp->v_type == VNON || vp->v_writecount == 0 || + (vp->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0) { + mutex_exit(vp->v_interlock); + continue; + } + mutex_exit(&mntvnode_lock); + if (vget(vp, LK_EXCLUSIVE)) { + mutex_enter(&mntvnode_lock); + (void)vunmark(mvp); + goto again; + } + if ((error = getinoquota(VTOI(vp))) != 0) { + vput(vp); + mutex_enter(&mntvnode_lock); + (void)vunmark(mvp); + break; + } + vput(vp); + mutex_enter(&mntvnode_lock); + } + mutex_exit(&mntvnode_lock); + vnfree(mvp); + + mutex_enter(&dqlock); + ump->umq1_qflags[type] &= ~QTF_OPENING; + cv_broadcast(&dqcv); + if (error == 0) + ump->um_flags |= UFS_QUOTA; + mutex_exit(&dqlock); + if (error) + quota1_handle_cmd_quotaoff(l, ump, type); + return (error); +} + +/* + * turn off disk quotas for a filesystem. + */ +int +quota1_handle_cmd_quotaoff(struct lwp *l, struct ufsmount *ump, int type) +{ + struct mount *mp = ump->um_mountp; + struct vnode *vp; + struct vnode *qvp, *mvp; + struct dquot *dq; + struct inode *ip; + kauth_cred_t cred; + int i, error; + + /* Allocate a marker vnode. 
*/ + mvp = vnalloc(mp); + + mutex_enter(&dqlock); + while ((ump->umq1_qflags[type] & (QTF_CLOSING | QTF_OPENING)) != 0) + cv_wait(&dqcv, &dqlock); + if ((qvp = ump->um_quotas[type]) == NULLVP) { + mutex_exit(&dqlock); + vnfree(mvp); + return (0); + } + ump->umq1_qflags[type] |= QTF_CLOSING; + ump->um_flags &= ~UFS_QUOTA; + mutex_exit(&dqlock); + /* + * Search vnodes associated with this mount point, + * deleting any references to quota file being closed. + */ + mutex_enter(&mntvnode_lock); +again: + for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) { + vmark(mvp, vp); + mutex_enter(vp->v_interlock); + if (VTOI(vp) == NULL || vp->v_mount != mp || vismarker(vp) || + vp->v_type == VNON || + (vp->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0) { + mutex_exit(vp->v_interlock); + continue; + } + mutex_exit(&mntvnode_lock); + if (vget(vp, LK_EXCLUSIVE)) { + mutex_enter(&mntvnode_lock); + (void)vunmark(mvp); + goto again; + } + ip = VTOI(vp); + dq = ip->i_dquot[type]; + ip->i_dquot[type] = NODQUOT; + dqrele(vp, dq); + vput(vp); + mutex_enter(&mntvnode_lock); + } + mutex_exit(&mntvnode_lock); +#ifdef DIAGNOSTIC + dqflush(qvp); +#endif + qvp->v_vflag &= ~VV_SYSTEM; + error = vn_close(qvp, FREAD|FWRITE, l->l_cred); + mutex_enter(&dqlock); + ump->um_quotas[type] = NULLVP; + cred = ump->um_cred[type]; + ump->um_cred[type] = NOCRED; + for (i = 0; i < MAXQUOTAS; i++) + if (ump->um_quotas[i] != NULLVP) + break; + ump->umq1_qflags[type] &= ~QTF_CLOSING; + cv_broadcast(&dqcv); + mutex_exit(&dqlock); + kauth_cred_free(cred); + if (i == MAXQUOTAS) + mp->mnt_flag &= ~MNT_QUOTA; + return (error); +} + +int +quota1_handle_cmd_get(struct ufsmount *ump, int type, int id, + int defaultq, prop_array_t replies) +{ + struct dquot *dq; + struct quotaval qv[QUOTA_NLIMITS]; + prop_dictionary_t dict; + int error; + uint64_t *valuesp[QUOTA_NLIMITS]; + valuesp[QUOTA_LIMIT_BLOCK] = &qv[QUOTA_LIMIT_BLOCK].qv_hardlimit; + valuesp[QUOTA_LIMIT_FILE] = &qv[QUOTA_LIMIT_FILE].qv_hardlimit; + + + if (ump->um_quotas[type] == NULLVP) + return ENODEV; + + if (defaultq) { /* we want the grace period of id 0 */ + if ((error = dqget(NULLVP, 0, ump, type, &dq)) != 0) + return error; + + } else { + if ((error = dqget(NULLVP, id, ump, type, &dq)) != 0) + return error; + } + dqblk_to_quotaval(&dq->dq_un.dq1_dqb, qv); + dqrele(NULLVP, dq); + if (defaultq) { + if (qv[QUOTA_LIMIT_BLOCK].qv_expiretime > 0) + qv[QUOTA_LIMIT_BLOCK].qv_grace = + qv[QUOTA_LIMIT_BLOCK].qv_expiretime; + else + qv[QUOTA_LIMIT_BLOCK].qv_grace = MAX_DQ_TIME; + if (qv[QUOTA_LIMIT_FILE].qv_expiretime > 0) + qv[QUOTA_LIMIT_FILE].qv_grace = + qv[QUOTA_LIMIT_FILE].qv_expiretime; + else + qv[QUOTA_LIMIT_FILE].qv_grace = MAX_DQ_TIME; + } + dict = quota64toprop(id, defaultq, valuesp, + ufs_quota_entry_names, UFS_QUOTA_NENTRIES, + ufs_quota_limit_names, QUOTA_NLIMITS); + if (dict == NULL) + return ENOMEM; + if (!prop_array_add_and_rel(replies, dict)) + return ENOMEM; + return 0; +} + +int +quota1_handle_cmd_set(struct ufsmount *ump, int type, int id, + int defaultq, prop_dictionary_t data) +{ + struct dquot *dq; + struct dqblk dqb; + int error; + uint64_t bval[2]; + uint64_t ival[2]; + const char *val_limitsonly_grace[] = {QUOTADICT_LIMIT_GTIME}; +#define Q1_GTIME 0 + const char *val_limitsonly_softhard[] = + {QUOTADICT_LIMIT_SOFT, QUOTADICT_LIMIT_HARD}; +#define Q1_SOFT 0 +#define Q1_HARD 1 + + uint64_t *valuesp[QUOTA_NLIMITS]; + valuesp[QUOTA_LIMIT_BLOCK] = bval; + valuesp[QUOTA_LIMIT_FILE] = ival; + + if (ump->um_quotas[type] == NULLVP) + return ENODEV; + + if (defaultq) 
{ + /* just update grace times */ + error = proptoquota64(data, valuesp, val_limitsonly_grace, 1, + ufs_quota_limit_names, QUOTA_NLIMITS); + if (error) + return error; + if ((error = dqget(NULLVP, id, ump, type, &dq)) != 0) + return error; + mutex_enter(&dq->dq_interlock); + if (bval[Q1_GTIME] > 0) + ump->umq1_btime[type] = dq->dq_btime = + bval[Q1_GTIME]; + if (ival[Q1_GTIME] > 0) + ump->umq1_itime[type] = dq->dq_itime = + ival[Q1_GTIME]; + mutex_exit(&dq->dq_interlock); + dq->dq_flags |= DQ_MOD; + dqrele(NULLVP, dq); + return 0; + } + error = proptoquota64(data, valuesp, val_limitsonly_softhard, 2, + ufs_quota_limit_names, QUOTA_NLIMITS); + if (error) + return error; + + if ((error = dqget(NULLVP, id, ump, type, &dq)) != 0) + return (error); + mutex_enter(&dq->dq_interlock); + /* + * Copy all but the current values. + * Reset time limit if previously had no soft limit or were + * under it, but now have a soft limit and are over it. + */ + dqb.dqb_curblocks = dq->dq_curblocks; + dqb.dqb_curinodes = dq->dq_curinodes; + dqb.dqb_btime = dq->dq_btime; + dqb.dqb_itime = dq->dq_itime; + dqb.dqb_bsoftlimit = (bval[Q1_SOFT] == UQUAD_MAX) ? 0 : bval[Q1_SOFT]; + dqb.dqb_bhardlimit = (bval[Q1_HARD] == UQUAD_MAX) ? 0 : bval[Q1_HARD]; + dqb.dqb_isoftlimit = (ival[Q1_SOFT] == UQUAD_MAX) ? 0 : ival[Q1_SOFT]; + dqb.dqb_ihardlimit = (ival[Q1_HARD] == UQUAD_MAX) ? 0 : ival[Q1_HARD]; + if (dq->dq_id == 0) { + /* also update grace time if available */ + if (proptoquota64(data, valuesp, val_limitsonly_grace, 1, + ufs_quota_limit_names, QUOTA_NLIMITS) == 0) { + if (bval[Q1_GTIME] > 0) + ump->umq1_btime[type] = dqb.dqb_btime = + bval[Q1_GTIME]; + if (ival[Q1_GTIME] > 0) + ump->umq1_itime[type] = dqb.dqb_itime = + ival[Q1_GTIME]; + } + } + if (dqb.dqb_bsoftlimit && + dq->dq_curblocks >= dqb.dqb_bsoftlimit && + (dq->dq_bsoftlimit == 0 || dq->dq_curblocks < dq->dq_bsoftlimit)) + dqb.dqb_btime = time_second + ump->umq1_btime[type]; + if (dqb.dqb_isoftlimit && + dq->dq_curinodes >= dqb.dqb_isoftlimit && + (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit)) + dqb.dqb_itime = time_second + ump->umq1_itime[type]; + dq->dq_un.dq1_dqb = dqb; + if (dq->dq_curblocks < dq->dq_bsoftlimit) + dq->dq_flags &= ~DQ_WARN(QL_BLOCK); + if (dq->dq_curinodes < dq->dq_isoftlimit) + dq->dq_flags &= ~DQ_WARN(QL_FILE); + if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && + dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) + dq->dq_flags |= DQ_FAKE; + else + dq->dq_flags &= ~DQ_FAKE; + dq->dq_flags |= DQ_MOD; + mutex_exit(&dq->dq_interlock); + dqrele(NULLVP, dq); + return (0); +} + + +#if 0 +/* + * Q_SETQUOTA - assign an entire dqblk structure. + */ +int +setquota1(struct mount *mp, u_long id, int type, struct dqblk *dqb) +{ + struct dquot *dq; + struct dquot *ndq; + struct ufsmount *ump = VFSTOUFS(mp); + + + if ((error = dqget(NULLVP, id, ump, type, &ndq)) != 0) + return (error); + dq = ndq; + mutex_enter(&dq->dq_interlock); + /* + * Copy all but the current values. + * Reset time limit if previously had no soft limit or were + * under it, but now have a soft limit and are over it. 
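/*
 * Illustrative sketch (not taken from the patch): when new limits are
 * installed (quota1_handle_cmd_set() above), the grace timer is (re)armed
 * only if there is now a soft limit, usage already meets it, and the old
 * soft limit was absent or not yet exceeded; otherwise the old deadline is
 * kept.  A one-function restatement with invented names:
 */
#include <time.h>

static time_t arm_grace(long cur, long old_soft, long new_soft,
                        time_t old_deadline, time_t now, long grace_secs)
{
    if (new_soft && cur >= new_soft && (old_soft == 0 || cur < old_soft))
        return now + grace_secs;             /* freshly over the new limit */
    return old_deadline;                     /* keep whatever was in effect */
}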
+ */ + dqb->dqb_curblocks = dq->dq_curblocks; + dqb->dqb_curinodes = dq->dq_curinodes; + if (dq->dq_id != 0) { + dqb->dqb_btime = dq->dq_btime; + dqb->dqb_itime = dq->dq_itime; + } + if (dqb->dqb_bsoftlimit && + dq->dq_curblocks >= dqb->dqb_bsoftlimit && + (dq->dq_bsoftlimit == 0 || dq->dq_curblocks < dq->dq_bsoftlimit)) + dqb->dqb_btime = time_second + ump->umq1_btime[type]; + if (dqb->dqb_isoftlimit && + dq->dq_curinodes >= dqb->dqb_isoftlimit && + (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit)) + dqb->dqb_itime = time_second + ump->umq1_itime[type]; + dq->dq_un.dq1_dqb = *dqb; + if (dq->dq_curblocks < dq->dq_bsoftlimit) + dq->dq_flags &= ~DQ_WARN(QL_BLOCK); + if (dq->dq_curinodes < dq->dq_isoftlimit) + dq->dq_flags &= ~DQ_WARN(QL_FILE); + if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && + dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) + dq->dq_flags |= DQ_FAKE; + else + dq->dq_flags &= ~DQ_FAKE; + dq->dq_flags |= DQ_MOD; + mutex_exit(&dq->dq_interlock); + dqrele(NULLVP, dq); + return (0); +} + +/* + * Q_SETUSE - set current inode and block usage. + */ +int +setuse(struct mount *mp, u_long id, int type, void *addr) +{ + struct dquot *dq; + struct ufsmount *ump = VFSTOUFS(mp); + struct dquot *ndq; + struct dqblk usage; + int error; + + error = copyin(addr, (void *)&usage, sizeof (struct dqblk)); + if (error) + return (error); + if ((error = dqget(NULLVP, id, ump, type, &ndq)) != 0) + return (error); + dq = ndq; + mutex_enter(&dq->dq_interlock); + /* + * Reset time limit if have a soft limit and were + * previously under it, but are now over it. + */ + if (dq->dq_bsoftlimit && dq->dq_curblocks < dq->dq_bsoftlimit && + usage.dqb_curblocks >= dq->dq_bsoftlimit) + dq->dq_btime = time_second + ump->umq1_btime[type]; + if (dq->dq_isoftlimit && dq->dq_curinodes < dq->dq_isoftlimit && + usage.dqb_curinodes >= dq->dq_isoftlimit) + dq->dq_itime = time_second + ump->umq1_itime[type]; + dq->dq_curblocks = usage.dqb_curblocks; + dq->dq_curinodes = usage.dqb_curinodes; + if (dq->dq_curblocks < dq->dq_bsoftlimit) + dq->dq_flags &= ~DQ_WARN(QL_BLOCK); + if (dq->dq_curinodes < dq->dq_isoftlimit) + dq->dq_flags &= ~DQ_WARN(QL_FILE); + dq->dq_flags |= DQ_MOD; + mutex_exit(&dq->dq_interlock); + dqrele(NULLVP, dq); + return (0); +} +#endif + +/* + * Q_SYNC - sync quota files to disk. + */ +int +q1sync(struct mount *mp) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct vnode *vp, *mvp; + struct dquot *dq; + int i, error; + + /* + * Check if the mount point has any quotas. + * If not, simply return. + */ + for (i = 0; i < MAXQUOTAS; i++) + if (ump->um_quotas[i] != NULLVP) + break; + if (i == MAXQUOTAS) + return (0); + + /* Allocate a marker vnode. */ + mvp = vnalloc(mp); + + /* + * Search vnodes associated with this mount point, + * synchronizing any modified dquot structures. 
+ */ + mutex_enter(&mntvnode_lock); + again: + for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) { + vmark(mvp, vp); + mutex_enter(vp->v_interlock); + if (VTOI(vp) == NULL || vp->v_mount != mp || vismarker(vp) || + vp->v_type == VNON || + (vp->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0) { + mutex_exit(vp->v_interlock); + continue; + } + mutex_exit(&mntvnode_lock); + error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT); + if (error) { + mutex_enter(&mntvnode_lock); + if (error == ENOENT) { + (void)vunmark(mvp); + goto again; + } + continue; + } + for (i = 0; i < MAXQUOTAS; i++) { + dq = VTOI(vp)->i_dquot[i]; + if (dq == NODQUOT) + continue; + mutex_enter(&dq->dq_interlock); + if (dq->dq_flags & DQ_MOD) + dq1sync(vp, dq); + mutex_exit(&dq->dq_interlock); + } + vput(vp); + mutex_enter(&mntvnode_lock); + } + mutex_exit(&mntvnode_lock); + vnfree(mvp); + return (0); +} + +/* + * Obtain a dquot structure for the specified identifier and quota file + * reading the information from the file if necessary. + */ +int +dq1get(struct vnode *dqvp, u_long id, struct ufsmount *ump, int type, + struct dquot *dq) +{ + struct iovec aiov; + struct uio auio; + int error; + + KASSERT(mutex_owned(&dq->dq_interlock)); + vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = (void *)&dq->dq_un.dq1_dqb; + aiov.iov_len = sizeof (struct dqblk); + auio.uio_resid = sizeof (struct dqblk); + auio.uio_offset = (off_t)(id * sizeof (struct dqblk)); + auio.uio_rw = UIO_READ; + UIO_SETUP_SYSSPACE(&auio); + error = VOP_READ(dqvp, &auio, 0, ump->um_cred[type]); + if (auio.uio_resid == sizeof(struct dqblk) && error == 0) + memset((void *)&dq->dq_un.dq1_dqb, 0, sizeof(struct dqblk)); + VOP_UNLOCK(dqvp); + /* + * I/O error in reading quota file, release + * quota structure and reflect problem to caller. + */ + if (error) + return (error); + /* + * Check for no limit to enforce. + * Initialize time values if necessary. + */ + if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && + dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) + dq->dq_flags |= DQ_FAKE; + if (dq->dq_id != 0) { + if (dq->dq_btime == 0) + dq->dq_btime = time_second + ump->umq1_btime[type]; + if (dq->dq_itime == 0) + dq->dq_itime = time_second + ump->umq1_itime[type]; + } + return (0); +} + +/* + * Update the disk quota in the quota file. 
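/*
 * Illustrative sketch (not taken from the patch): a quota v1 file is a flat
 * array of fixed-size records indexed by uid or gid, which is why dq1get()
 * above and dq1sync() below seek to id * sizeof(struct dqblk).  A user-space
 * model of that addressing with a made-up record type; reading past EOF is
 * treated as an all-zero (limitless) record, as the kernel code does.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

struct toy_dqblk {
    uint32_t hardlimit, softlimit, curblocks, pad;
};

static int record_read(int fd, unsigned long id, struct toy_dqblk *rec)
{
    off_t off = (off_t)id * sizeof(*rec);
    ssize_t n = pread(fd, rec, sizeof(*rec), off);

    if (n == 0)
        memset(rec, 0, sizeof(*rec));        /* hole past EOF: no limits yet */
    return (n == 0 || n == (ssize_t)sizeof(*rec)) ? 0 : -1;
}

static int record_write(int fd, unsigned long id, const struct toy_dqblk *rec)
{
    off_t off = (off_t)id * sizeof(*rec);

    return pwrite(fd, rec, sizeof(*rec), off) == (ssize_t)sizeof(*rec) ? 0 : -1;
}

int main(void)
{
    struct toy_dqblk rec = { 1000, 800, 12, 0 };
    int fd = open("toy.quota", O_RDWR | O_CREAT, 0600);

    if (fd == -1)
        return 1;
    record_write(fd, 1234, &rec);            /* store limits for uid 1234 */
    record_read(fd, 1234, &rec);
    printf("uid 1234 hard=%u\n", (unsigned)rec.hardlimit);
    close(fd);
    return 0;
}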
+ */ +int +dq1sync(struct vnode *vp, struct dquot *dq) +{ + struct vnode *dqvp; + struct iovec aiov; + struct uio auio; + int error; + + if (dq == NODQUOT) + panic("dq1sync: dquot"); + KASSERT(mutex_owned(&dq->dq_interlock)); + if ((dq->dq_flags & DQ_MOD) == 0) + return (0); + if ((dqvp = dq->dq_ump->um_quotas[dq->dq_type]) == NULLVP) + panic("dq1sync: file"); + KASSERT(dqvp != vp); + vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = (void *)&dq->dq_un.dq1_dqb; + aiov.iov_len = sizeof (struct dqblk); + auio.uio_resid = sizeof (struct dqblk); + auio.uio_offset = (off_t)(dq->dq_id * sizeof (struct dqblk)); + auio.uio_rw = UIO_WRITE; + UIO_SETUP_SYSSPACE(&auio); + error = VOP_WRITE(dqvp, &auio, 0, dq->dq_ump->um_cred[dq->dq_type]); + if (auio.uio_resid && error == 0) + error = EIO; + dq->dq_flags &= ~DQ_MOD; + VOP_UNLOCK(dqvp); + return (error); +} diff --git a/sys/ufs/ufs/ufs_quota2.c b/sys/ufs/ufs/ufs_quota2.c new file mode 100644 index 000000000..823e398bd --- /dev/null +++ b/sys/ufs/ufs/ufs_quota2.c @@ -0,0 +1,1012 @@ +/* $NetBSD: ufs_quota2.c,v 1.4 2011/06/07 14:56:13 bouyer Exp $ */ +/*- + * Copyright (c) 2010 Manuel Bouyer + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ufs_quota2.c,v 1.4 2011/06/07 14:56:13 bouyer Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * LOCKING: + * Data in the entries are protected by the associated struct dquot's + * dq_interlock (this means we can't read or change a quota entry without + * grabing a dquot for it). + * The header and lists (including pointers in the data entries, and q2e_uid) + * are protected by the global dqlock. 
+ * the locking order is dq_interlock -> dqlock + */ + +static int quota2_bwrite(struct mount *, struct buf *); +static int getinoquota2(struct inode *, bool, bool, struct buf **, + struct quota2_entry **); +static int getq2h(struct ufsmount *, int, struct buf **, + struct quota2_header **, int); +static int getq2e(struct ufsmount *, int, daddr_t, int, struct buf **, + struct quota2_entry **, int); +static int quota2_walk_list(struct ufsmount *, struct buf *, int, + uint64_t *, int, void *, + int (*func)(struct ufsmount *, uint64_t *, struct quota2_entry *, + uint64_t, void *)); + +static int quota2_dict_update_q2e_limits(prop_dictionary_t, + struct quota2_entry *); +static prop_dictionary_t q2etoprop(struct quota2_entry *, int); + +static const char *limnames[] = INITQLNAMES; + +static int +quota2_dict_update_q2e_limits(prop_dictionary_t data, + struct quota2_entry *q2e) +{ + const char *val_limitsonly_names[] = INITQVNAMES_LIMITSONLY; + + int i, error; + prop_dictionary_t val; + + for (i = 0; i < N_QL; i++) { + if (!prop_dictionary_get_dict(data, limnames[i], &val)) + return EINVAL; + error = quotaprop_dict_get_uint64(val, + &q2e->q2e_val[i].q2v_hardlimit, + val_limitsonly_names, N_QV, true); + if (error) + return error; + } + return 0; +} +static prop_dictionary_t +q2etoprop(struct quota2_entry *q2e, int def) +{ + const char *val_names[] = INITQVNAMES_ALL; + prop_dictionary_t dict1 = prop_dictionary_create(); + prop_dictionary_t dict2; + int i; + + if (dict1 == NULL) + return NULL; + + if (def) { + if (!prop_dictionary_set_cstring_nocopy(dict1, "id", + "default")) { + goto err; + } + } else { + if (!prop_dictionary_set_uint32(dict1, "id", q2e->q2e_uid)) { + goto err; + } + } + for (i = 0; i < N_QL; i++) { + dict2 = limits64toprop(&q2e->q2e_val[i].q2v_hardlimit, + val_names, N_QV); + if (dict2 == NULL) + goto err; + if (!prop_dictionary_set_and_rel(dict1, limnames[i], dict2)) + goto err; + } + return dict1; + +err: + prop_object_release(dict1); + return NULL; +} + + +static int +quota2_bwrite(struct mount *mp, struct buf *bp) +{ + if (mp->mnt_flag & MNT_SYNCHRONOUS) + return bwrite(bp); + else { + bdwrite(bp); + return 0; + } +} + +static int +getq2h(struct ufsmount *ump, int type, + struct buf **bpp, struct quota2_header **q2hp, int flags) +{ +#ifdef FFS_EI + const int needswap = UFS_MPNEEDSWAP(ump); +#endif + int error; + struct buf *bp; + struct quota2_header *q2h; + + KASSERT(mutex_owned(&dqlock)); + error = bread(ump->um_quotas[type], 0, ump->umq2_bsize, + ump->um_cred[type], flags, &bp); + if (error) + return error; + if (bp->b_resid != 0) + panic("dq2get: %s quota file truncated", quotatypes[type]); + + q2h = (void *)bp->b_data; + if (ufs_rw32(q2h->q2h_magic_number, needswap) != Q2_HEAD_MAGIC || + q2h->q2h_type != type) + panic("dq2get: corrupted %s quota header", quotatypes[type]); + *bpp = bp; + *q2hp = q2h; + return 0; +} + +static int +getq2e(struct ufsmount *ump, int type, daddr_t lblkno, int blkoffset, + struct buf **bpp, struct quota2_entry **q2ep, int flags) +{ + int error; + struct buf *bp; + + if (blkoffset & (sizeof(uint64_t) - 1)) { + panic("dq2get: %s quota file corrupted", + quotatypes[type]); + } + error = bread(ump->um_quotas[type], lblkno, ump->umq2_bsize, + ump->um_cred[type], flags, &bp); + if (error) + return error; + if (bp->b_resid != 0) { + panic("dq2get: %s quota file corrupted", + quotatypes[type]); + } + *q2ep = (void *)((char *)bp->b_data + blkoffset); + *bpp = bp; + return 0; +} + +/* walk a quota entry list, calling the callback for each entry */ 
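+/*
+ * A walk callback has the shape
+ *	int cb(struct ufsmount *ump, uint64_t *offp, struct quota2_entry *q2e,
+ *	    uint64_t off, void *cookie);
+ * it may rewrite *offp to unlink the current entry from its chain, and a
+ * return value with Q2WL_ABORT set stops the walk without reporting an error.
+ */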
+#define Q2WL_ABORT 0x10000000 + +static int +quota2_walk_list(struct ufsmount *ump, struct buf *hbp, int type, + uint64_t *offp, int flags, void *a, + int (*func)(struct ufsmount *, uint64_t *, struct quota2_entry *, uint64_t, void *)) +{ +#ifdef FFS_EI + const int needswap = UFS_MPNEEDSWAP(ump); +#endif + daddr_t off = ufs_rw64(*offp, needswap); + struct buf *bp, *obp = hbp; + int ret = 0, ret2 = 0; + struct quota2_entry *q2e; + daddr_t lblkno, blkoff, olblkno = 0; + + KASSERT(mutex_owner(&dqlock)); + + while (off != 0) { + lblkno = (off >> ump->um_mountp->mnt_fs_bshift); + blkoff = (off & ump->umq2_bmask); + if (lblkno == 0) { + /* in the header block */ + bp = hbp; + } else if (lblkno == olblkno) { + /* still in the same buf */ + bp = obp; + } else { + ret = bread(ump->um_quotas[type], lblkno, + ump->umq2_bsize, + ump->um_cred[type], flags, &bp); + if (ret) + return ret; + if (bp->b_resid != 0) { + panic("quota2_walk_list: %s quota file corrupted", + quotatypes[type]); + } + } + q2e = (void *)((char *)(bp->b_data) + blkoff); + ret = (*func)(ump, offp, q2e, off, a); + if (off != ufs_rw64(*offp, needswap)) { + /* callback changed parent's pointer, redo */ + off = ufs_rw64(*offp, needswap); + if (bp != hbp && bp != obp) + ret2 = bwrite(bp); + } else { + /* parent if now current */ + if (obp != bp && obp != hbp) { + if (flags & B_MODIFY) + ret2 = bwrite(obp); + else + brelse(obp, 0); + } + obp = bp; + olblkno = lblkno; + offp = &(q2e->q2e_next); + off = ufs_rw64(*offp, needswap); + } + if (ret) + break; + if (ret2) { + ret = ret2; + break; + } + } + if (obp != hbp) { + if (flags & B_MODIFY) + ret2 = bwrite(obp); + else + brelse(obp, 0); + } + if (ret & Q2WL_ABORT) + return 0; + if (ret == 0) + return ret2; + return ret; +} + +int +quota2_umount(struct mount *mp, int flags) +{ + int i, error; + struct ufsmount *ump = VFSTOUFS(mp); + + if ((ump->um_flags & UFS_QUOTA2) == 0) + return 0; + + for (i = 0; i < MAXQUOTAS; i++) { + if (ump->um_quotas[i] != NULLVP) { + error = vn_close(ump->um_quotas[i], FREAD|FWRITE, + ump->um_cred[i]); + if (error) { + printf("quota2_umount failed: close(%p) %d\n", + ump->um_quotas[i], error); + return error; + } + } + ump->um_quotas[i] = NULLVP; + } + return 0; +} + +static int +quota2_q2ealloc(struct ufsmount *ump, int type, uid_t uid, struct dquot *dq, + struct buf **bpp, struct quota2_entry **q2ep) +{ + int error, error2; + struct buf *hbp, *bp; + struct quota2_header *q2h; + struct quota2_entry *q2e; + daddr_t offset; + u_long hash_mask; + const int needswap = UFS_MPNEEDSWAP(ump); + + KASSERT(mutex_owned(&dq->dq_interlock)); + KASSERT(mutex_owned(&dqlock)); + error = getq2h(ump, type, &hbp, &q2h, B_MODIFY); + if (error) + return error; + offset = ufs_rw64(q2h->q2h_free, needswap); + if (offset == 0) { + struct vnode *vp = ump->um_quotas[type]; + struct inode *ip = VTOI(vp); + uint64_t size = ip->i_size; + /* need to alocate a new disk block */ + error = UFS_BALLOC(vp, size, ump->umq2_bsize, + ump->um_cred[type], B_CLRBUF | B_SYNC, &bp); + if (error) { + brelse(hbp, 0); + return error; + } + KASSERT((ip->i_size % ump->umq2_bsize) == 0); + ip->i_size += ump->umq2_bsize; + DIP_ASSIGN(ip, size, ip->i_size); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + uvm_vnp_setsize(vp, ip->i_size); + quota2_addfreeq2e(q2h, bp->b_data, size, ump->umq2_bsize, + needswap); + error = bwrite(bp); + error2 = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT); + if (error || error2) { + brelse(hbp, 0); + if (error) + return error; + return error2; + } + offset = ufs_rw64(q2h->q2h_free, needswap); + 
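/* the block just appended must have put entries on the free list */ +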
KASSERT(offset != 0); + } + dq->dq2_lblkno = (offset >> ump->um_mountp->mnt_fs_bshift); + dq->dq2_blkoff = (offset & ump->umq2_bmask); + if (dq->dq2_lblkno == 0) { + bp = hbp; + q2e = (void *)((char *)bp->b_data + dq->dq2_blkoff); + } else { + error = getq2e(ump, type, dq->dq2_lblkno, + dq->dq2_blkoff, &bp, &q2e, B_MODIFY); + if (error) { + brelse(hbp, 0); + return error; + } + } + hash_mask = ((1 << q2h->q2h_hash_shift) - 1); + /* remove from free list */ + q2h->q2h_free = q2e->q2e_next; + + memcpy(q2e, &q2h->q2h_defentry, sizeof(*q2e)); + q2e->q2e_uid = ufs_rw32(uid, needswap); + /* insert in hash list */ + q2e->q2e_next = q2h->q2h_entries[uid & hash_mask]; + q2h->q2h_entries[uid & hash_mask] = ufs_rw64(offset, needswap); + if (hbp != bp) { + bwrite(hbp); + } + *q2ep = q2e; + *bpp = bp; + return 0; +} + +static int +getinoquota2(struct inode *ip, bool alloc, bool modify, struct buf **bpp, + struct quota2_entry **q2ep) +{ + int error; + int i; + struct dquot *dq; + struct ufsmount *ump = ip->i_ump; + u_int32_t ino_ids[MAXQUOTAS]; + + error = getinoquota(ip); + if (error) + return error; + + if (alloc) { + UFS_WAPBL_JLOCK_ASSERT(ump->um_mountp); + } + ino_ids[USRQUOTA] = ip->i_uid; + ino_ids[GRPQUOTA] = ip->i_gid; + /* first get the interlock for all dquot */ + for (i = 0; i < MAXQUOTAS; i++) { + dq = ip->i_dquot[i]; + if (dq == NODQUOT) + continue; + mutex_enter(&dq->dq_interlock); + } + /* now get the corresponding quota entry */ + for (i = 0; i < MAXQUOTAS; i++) { + bpp[i] = NULL; + q2ep[i] = NULL; + dq = ip->i_dquot[i]; + if (dq == NODQUOT) + continue; + if (__predict_false(ump->um_quotas[i] == NULL)) { + /* + * quotas have been turned off. This can happen + * at umount time. + */ + mutex_exit(&dq->dq_interlock); + dqrele(NULLVP, dq); + ip->i_dquot[i] = NULL; + continue; + } + + if ((dq->dq2_lblkno | dq->dq2_blkoff) == 0) { + if (!alloc) { + continue; + } + /* need to alloc a new on-disk quot */ + mutex_enter(&dqlock); + error = quota2_q2ealloc(ump, i, ino_ids[i], dq, + &bpp[i], &q2ep[i]); + mutex_exit(&dqlock); + if (error) + return error; + } else { + error = getq2e(ump, i, dq->dq2_lblkno, + dq->dq2_blkoff, &bpp[i], &q2ep[i], + modify ? 
B_MODIFY : 0); + if (error) + return error; + } + } + return 0; +} + +static int +quota2_check(struct inode *ip, int vtype, int64_t change, kauth_cred_t cred, + int flags) +{ + int error; + struct buf *bp[MAXQUOTAS]; + struct quota2_entry *q2e[MAXQUOTAS]; + struct quota2_val *q2vp; + struct dquot *dq; + uint64_t ncurblks; + struct ufsmount *ump = ip->i_ump; + struct mount *mp = ump->um_mountp; + const int needswap = UFS_MPNEEDSWAP(ump); + int i; + + if ((error = getinoquota2(ip, change > 0, change != 0, bp, q2e)) != 0) + return error; + if (change == 0) { + for (i = 0; i < MAXQUOTAS; i++) { + dq = ip->i_dquot[i]; + if (dq == NODQUOT) + continue; + if (bp[i]) + brelse(bp[i], 0); + mutex_exit(&dq->dq_interlock); + } + return 0; + } + if (change < 0) { + for (i = 0; i < MAXQUOTAS; i++) { + dq = ip->i_dquot[i]; + if (dq == NODQUOT) + continue; + if (q2e[i] == NULL) { + mutex_exit(&dq->dq_interlock); + continue; + } + q2vp = &q2e[i]->q2e_val[vtype]; + ncurblks = ufs_rw64(q2vp->q2v_cur, needswap); + if (ncurblks < -change) + ncurblks = 0; + else + ncurblks += change; + q2vp->q2v_cur = ufs_rw64(ncurblks, needswap); + quota2_bwrite(mp, bp[i]); + mutex_exit(&dq->dq_interlock); + } + return 0; + } + /* see if the allocation is allowed */ + for (i = 0; i < MAXQUOTAS; i++) { + struct quota2_val q2v; + int ql_stat; + dq = ip->i_dquot[i]; + if (dq == NODQUOT) + continue; + KASSERT(q2e[i] != NULL); + quota2_ufs_rwq2v(&q2e[i]->q2e_val[vtype], &q2v, needswap); + ql_stat = quota2_check_limit(&q2v, change, time_second); + + if ((flags & FORCE) == 0 && + kauth_authorize_system(cred, KAUTH_SYSTEM_FS_QUOTA, + KAUTH_REQ_SYSTEM_FS_QUOTA_NOLIMIT, + KAUTH_ARG(i), KAUTH_ARG(vtype), NULL) != 0) { + /* enforce this limit */ + switch(QL_STATUS(ql_stat)) { + case QL_S_DENY_HARD: + if ((dq->dq_flags & DQ_WARN(vtype)) == 0) { + uprintf("\n%s: write failed, %s %s " + "limit reached\n", + mp->mnt_stat.f_mntonname, + quotatypes[i], limnames[vtype]); + dq->dq_flags |= DQ_WARN(vtype); + } + error = EDQUOT; + break; + case QL_S_DENY_GRACE: + if ((dq->dq_flags & DQ_WARN(vtype)) == 0) { + uprintf("\n%s: write failed, %s %s " + "limit reached\n", + mp->mnt_stat.f_mntonname, + quotatypes[i], limnames[vtype]); + dq->dq_flags |= DQ_WARN(vtype); + } + error = EDQUOT; + break; + case QL_S_ALLOW_SOFT: + if ((dq->dq_flags & DQ_WARN(vtype)) == 0) { + uprintf("\n%s: warning, %s %s " + "quota exceeded\n", + mp->mnt_stat.f_mntonname, + quotatypes[i], limnames[vtype]); + dq->dq_flags |= DQ_WARN(vtype); + } + break; + } + } + /* + * always do this; we don't know if the allocation will + * succed or not in the end. 
if we don't do the allocation + * q2v_time will be ignored anyway + */ + if (ql_stat & QL_F_CROSS) { + q2v.q2v_time = time_second + q2v.q2v_grace; + quota2_ufs_rwq2v(&q2v, &q2e[i]->q2e_val[vtype], + needswap); + } + } + + /* now do the allocation if allowed */ + for (i = 0; i < MAXQUOTAS; i++) { + dq = ip->i_dquot[i]; + if (dq == NODQUOT) + continue; + KASSERT(q2e[i] != NULL); + if (error == 0) { + q2vp = &q2e[i]->q2e_val[vtype]; + ncurblks = ufs_rw64(q2vp->q2v_cur, needswap); + q2vp->q2v_cur = ufs_rw64(ncurblks + change, needswap); + quota2_bwrite(mp, bp[i]); + } else + brelse(bp[i], 0); + mutex_exit(&dq->dq_interlock); + } + return error; +} + +int +chkdq2(struct inode *ip, int64_t change, kauth_cred_t cred, int flags) +{ + return quota2_check(ip, QL_BLOCK, change, cred, flags); +} + +int +chkiq2(struct inode *ip, int32_t change, kauth_cred_t cred, int flags) +{ + return quota2_check(ip, QL_FILE, change, cred, flags); +} + +int +quota2_handle_cmd_set(struct ufsmount *ump, int type, int id, + int defaultq, prop_dictionary_t data) +{ + int error; + struct dquot *dq; + struct quota2_header *q2h; + struct quota2_entry q2e, *q2ep; + struct buf *bp; + const int needswap = UFS_MPNEEDSWAP(ump); + + if (ump->um_quotas[type] == NULLVP) + return ENODEV; + error = UFS_WAPBL_BEGIN(ump->um_mountp); + if (error) + return error; + + if (defaultq) { + mutex_enter(&dqlock); + error = getq2h(ump, type, &bp, &q2h, B_MODIFY); + if (error) { + mutex_exit(&dqlock); + goto out_wapbl; + } + quota2_ufs_rwq2e(&q2h->q2h_defentry, &q2e, needswap); + error = quota2_dict_update_q2e_limits(data, &q2e); + if (error) { + mutex_exit(&dqlock); + brelse(bp, 0); + goto out_wapbl; + } + quota2_ufs_rwq2e(&q2e, &q2h->q2h_defentry, needswap); + mutex_exit(&dqlock); + quota2_bwrite(ump->um_mountp, bp); + goto out_wapbl; + } + + error = dqget(NULLVP, id, ump, type, &dq); + if (error) + goto out_wapbl; + + mutex_enter(&dq->dq_interlock); + if (dq->dq2_lblkno == 0 && dq->dq2_blkoff == 0) { + /* need to alloc a new on-disk quot */ + mutex_enter(&dqlock); + error = quota2_q2ealloc(ump, type, id, dq, &bp, &q2ep); + mutex_exit(&dqlock); + } else { + error = getq2e(ump, type, dq->dq2_lblkno, dq->dq2_blkoff, + &bp, &q2ep, B_MODIFY); + } + if (error) + goto out_il; + + quota2_ufs_rwq2e(q2ep, &q2e, needswap); + error = quota2_dict_update_q2e_limits(data, &q2e); + if (error) { + brelse(bp, 0); + goto out_il; + } + quota2_ufs_rwq2e(&q2e, q2ep, needswap); + quota2_bwrite(ump->um_mountp, bp); + +out_il: + mutex_exit(&dq->dq_interlock); + dqrele(NULLVP, dq); +out_wapbl: + UFS_WAPBL_END(ump->um_mountp); + return error; +} + +struct dq2clear_callback { + uid_t id; + struct dquot *dq; + struct quota2_header *q2h; +}; + +static int +dq2clear_callback(struct ufsmount *ump, uint64_t *offp, struct quota2_entry *q2e, + uint64_t off, void *v) +{ + struct dq2clear_callback *c = v; +#ifdef FFS_EI + const int needswap = UFS_MPNEEDSWAP(ump); +#endif + uint64_t myoff; + + if (ufs_rw32(q2e->q2e_uid, needswap) == c->id) { + KASSERT(mutex_owned(&c->dq->dq_interlock)); + c->dq->dq2_lblkno = 0; + c->dq->dq2_blkoff = 0; + myoff = *offp; + /* remove from hash list */ + *offp = q2e->q2e_next; + /* add to free list */ + q2e->q2e_next = c->q2h->q2h_free; + c->q2h->q2h_free = myoff; + return Q2WL_ABORT; + } + return 0; +} +int +quota2_handle_cmd_clear(struct ufsmount *ump, int type, int id, + int defaultq, prop_dictionary_t data) +{ + int error, i; + struct dquot *dq; + struct quota2_header *q2h; + struct quota2_entry q2e, *q2ep; + struct buf *hbp, *bp; + u_long 
hash_mask; + struct dq2clear_callback c; + + if (ump->um_quotas[type] == NULLVP) + return ENODEV; + if (defaultq) + return EOPNOTSUPP; + + /* get the default entry before locking the entry's buffer */ + mutex_enter(&dqlock); + error = getq2h(ump, type, &hbp, &q2h, 0); + if (error) { + mutex_exit(&dqlock); + return error; + } + /* we'll copy to another disk entry, so no need to swap */ + memcpy(&q2e, &q2h->q2h_defentry, sizeof(q2e)); + mutex_exit(&dqlock); + brelse(hbp, 0); + + error = dqget(NULLVP, id, ump, type, &dq); + if (error) + return error; + + mutex_enter(&dq->dq_interlock); + if (dq->dq2_lblkno == 0 && dq->dq2_blkoff == 0) { + /* already clear, nothing to do */ + error = ENOENT; + goto out_il; + } + error = UFS_WAPBL_BEGIN(ump->um_mountp); + if (error) + goto out_dq; + + error = getq2e(ump, type, dq->dq2_lblkno, dq->dq2_blkoff, + &bp, &q2ep, B_MODIFY); + if (error) + goto out_wapbl; + + if (q2ep->q2e_val[QL_BLOCK].q2v_cur != 0 || + q2ep->q2e_val[QL_FILE].q2v_cur != 0) { + /* can't free this entry; revert to default */ + for (i = 0; i < N_QL; i++) { + q2ep->q2e_val[i].q2v_softlimit = + q2e.q2e_val[i].q2v_softlimit; + q2ep->q2e_val[i].q2v_hardlimit = + q2e.q2e_val[i].q2v_hardlimit; + q2ep->q2e_val[i].q2v_grace = + q2e.q2e_val[i].q2v_grace; + q2ep->q2e_val[i].q2v_time = 0; + } + quota2_bwrite(ump->um_mountp, bp); + goto out_wapbl; + } + /* we can free it. release bp so we can walk the list */ + brelse(bp, 0); + mutex_enter(&dqlock); + error = getq2h(ump, type, &hbp, &q2h, 0); + if (error) + goto out_dqlock; + + hash_mask = ((1 << q2h->q2h_hash_shift) - 1); + c.dq = dq; + c.id = id; + c.q2h = q2h; + error = quota2_walk_list(ump, hbp, type, + &q2h->q2h_entries[id & hash_mask], B_MODIFY, &c, + dq2clear_callback); + + bwrite(hbp); + +out_dqlock: + mutex_exit(&dqlock); +out_wapbl: + UFS_WAPBL_END(ump->um_mountp); +out_il: + mutex_exit(&dq->dq_interlock); +out_dq: + dqrele(NULLVP, dq); + return error; +} + +static int +quota2_array_add_q2e(struct ufsmount *ump, int type, + int id, prop_array_t replies) +{ + struct dquot *dq; + int error; + struct quota2_entry *q2ep, q2e; + struct buf *bp; + const int needswap = UFS_MPNEEDSWAP(ump); + prop_dictionary_t dict; + + error = dqget(NULLVP, id, ump, type, &dq); + if (error) + return error; + + mutex_enter(&dq->dq_interlock); + if (dq->dq2_lblkno == 0 && dq->dq2_blkoff == 0) { + mutex_exit(&dq->dq_interlock); + dqrele(NULLVP, dq); + return ENOENT; + } + error = getq2e(ump, type, dq->dq2_lblkno, dq->dq2_blkoff, + &bp, &q2ep, 0); + if (error) { + mutex_exit(&dq->dq_interlock); + dqrele(NULLVP, dq); + return error; + } + quota2_ufs_rwq2e(q2ep, &q2e, needswap); + brelse(bp, 0); + mutex_exit(&dq->dq_interlock); + dqrele(NULLVP, dq); + dict = q2etoprop(&q2e, 0); + if (dict == NULL) + return ENOMEM; + if (!prop_array_add_and_rel(replies, dict)) + return ENOMEM; + return 0; +} + +int +quota2_handle_cmd_get(struct ufsmount *ump, int type, int id, + int defaultq, prop_array_t replies) +{ + int error; + struct quota2_header *q2h; + struct quota2_entry q2e; + struct buf *bp; + prop_dictionary_t dict; + const int needswap = UFS_MPNEEDSWAP(ump); + + if (ump->um_quotas[type] == NULLVP) + return ENODEV; + if (defaultq) { + mutex_enter(&dqlock); + error = getq2h(ump, type, &bp, &q2h, 0); + if (error) { + mutex_exit(&dqlock); + return error; + } + quota2_ufs_rwq2e(&q2h->q2h_defentry, &q2e, needswap); + mutex_exit(&dqlock); + brelse(bp, 0); + dict = q2etoprop(&q2e, defaultq); + if (dict == NULL) + return ENOMEM; + if (!prop_array_add_and_rel(replies, dict)) + return 
ENOMEM; + } else + error = quota2_array_add_q2e(ump, type, id, replies); + + return error; +} + +struct getuids { + long nuids; /* number of uids in array */ + long size; /* size of array */ + uid_t *uids; /* array of uids, dynamically allocated */ +}; + +static int +quota2_getuids_callback(struct ufsmount *ump, uint64_t *offp, + struct quota2_entry *q2ep, uint64_t off, void *v) +{ + struct getuids *gu = v; + uid_t *newuids; +#ifdef FFS_EI + const int needswap = UFS_MPNEEDSWAP(ump); +#endif + + if (gu->nuids == gu->size) { + newuids = realloc(gu->uids, gu->size + PAGE_SIZE, M_TEMP, + M_WAITOK); + if (newuids == NULL) { + free(gu->uids, M_TEMP); + return ENOMEM; + } + gu->uids = newuids; + gu->size += (PAGE_SIZE / sizeof(uid_t)); + } + gu->uids[gu->nuids] = ufs_rw32(q2ep->q2e_uid, needswap); + gu->nuids++; + return 0; +} + +int +quota2_handle_cmd_getall(struct ufsmount *ump, int type, prop_array_t replies) +{ + int error; + struct quota2_header *q2h; + struct quota2_entry q2e; + struct buf *hbp; + prop_dictionary_t dict; + uint64_t offset; + int i, j; + int quota2_hash_size; + const int needswap = UFS_MPNEEDSWAP(ump); + struct getuids gu; + + if (ump->um_quotas[type] == NULLVP) + return ENODEV; + mutex_enter(&dqlock); + error = getq2h(ump, type, &hbp, &q2h, 0); + if (error) { + mutex_exit(&dqlock); + return error; + } + quota2_ufs_rwq2e(&q2h->q2h_defentry, &q2e, needswap); + dict = q2etoprop(&q2e, 1); + if (!prop_array_add_and_rel(replies, dict)) { + error = ENOMEM; + goto error_bp; + } + /* + * we can't directly get entries as we can't walk the list + * with qdlock and grab dq_interlock to read the entries + * at the same time. So just walk the lists to build a list of uid, + * and then read entries for these uids + */ + memset(&gu, 0, sizeof(gu)); + quota2_hash_size = ufs_rw16(q2h->q2h_hash_size, needswap); + for (i = 0; i < quota2_hash_size ; i++) { + offset = q2h->q2h_entries[i]; + error = quota2_walk_list(ump, hbp, type, &offset, 0, &gu, + quota2_getuids_callback); + if (error) { + if (gu.uids != NULL) + free(gu.uids, M_TEMP); + break; + } + } +error_bp: + mutex_exit(&dqlock); + brelse(hbp, 0); + if (error) + return error; + for (j = 0; j < gu.nuids; j++) { + error = quota2_array_add_q2e(ump, type, + gu.uids[j], replies); + if (error && error != ENOENT) + break; + } + free(gu.uids, M_TEMP); + return error; +} + +int +q2sync(struct mount *mp) +{ + return 0; +} + +struct dq2get_callback { + uid_t id; + struct dquot *dq; +}; + +static int +dq2get_callback(struct ufsmount *ump, uint64_t *offp, struct quota2_entry *q2e, + uint64_t off, void *v) +{ + struct dq2get_callback *c = v; + daddr_t lblkno; + int blkoff; +#ifdef FFS_EI + const int needswap = UFS_MPNEEDSWAP(ump); +#endif + + if (ufs_rw32(q2e->q2e_uid, needswap) == c->id) { + KASSERT(mutex_owned(&c->dq->dq_interlock)); + lblkno = (off >> ump->um_mountp->mnt_fs_bshift); + blkoff = (off & ump->umq2_bmask); + c->dq->dq2_lblkno = lblkno; + c->dq->dq2_blkoff = blkoff; + return Q2WL_ABORT; + } + return 0; +} + +int +dq2get(struct vnode *dqvp, u_long id, struct ufsmount *ump, int type, + struct dquot *dq) +{ + struct buf *bp; + struct quota2_header *q2h; + int error; + daddr_t offset; + u_long hash_mask; + struct dq2get_callback c = { + .id = id, + .dq = dq + }; + + KASSERT(mutex_owned(&dq->dq_interlock)); + mutex_enter(&dqlock); + error = getq2h(ump, type, &bp, &q2h, 0); + if (error) + goto out_mutex; + /* look for our entry */ + hash_mask = ((1 << q2h->q2h_hash_shift) - 1); + offset = q2h->q2h_entries[id & hash_mask]; + error = 
quota2_walk_list(ump, bp, type, &offset, 0, (void *)&c, + dq2get_callback); + brelse(bp, 0); +out_mutex: + mutex_exit(&dqlock); + return error; +} + +int +dq2sync(struct vnode *vp, struct dquot *dq) +{ + return 0; +} diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c new file mode 100644 index 000000000..4ab40c8c9 --- /dev/null +++ b/sys/ufs/ufs/ufs_readwrite.c @@ -0,0 +1,533 @@ +/* $NetBSD: ufs_readwrite.c,v 1.100 2011/11/18 21:18:52 christos Exp $ */ + +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95 + */ + +#include +__KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.100 2011/11/18 21:18:52 christos Exp $"); + +#ifdef LFS_READWRITE +#define FS struct lfs +#define I_FS i_lfs +#define READ lfs_read +#define READ_S "lfs_read" +#define WRITE lfs_write +#define WRITE_S "lfs_write" +#define fs_bsize lfs_bsize +#define fs_bmask lfs_bmask +#define UFS_WAPBL_BEGIN(mp) 0 +#define UFS_WAPBL_END(mp) do { } while (0) +#define UFS_WAPBL_UPDATE(vp, access, modify, flags) do { } while (0) +#else +#define FS struct fs +#define I_FS i_fs +#define READ ffs_read +#define READ_S "ffs_read" +#define WRITE ffs_write +#define WRITE_S "ffs_write" +#endif + +/* + * Vnode op for reading. 
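+ * For regular files the copy goes through the page cache (ubc_uiomove);
+ * other vnode types, and the LFS ifile, use the buffer cache loop below.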
+ */ +/* ARGSUSED */ +int +READ(void *v) +{ + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp; + struct inode *ip; + struct uio *uio; + struct ufsmount *ump; + struct buf *bp; + FS *fs; + vsize_t bytelen; + daddr_t lbn, nextlbn; + off_t bytesinfile; + long size, xfersize, blkoffset; + int error, ioflag; + bool usepc = false; + + vp = ap->a_vp; + ip = VTOI(vp); + ump = ip->i_ump; + uio = ap->a_uio; + ioflag = ap->a_ioflag; + error = 0; + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_READ) + panic("%s: mode", READ_S); + + if (vp->v_type == VLNK) { + if (ip->i_size < ump->um_maxsymlinklen || + (ump->um_maxsymlinklen == 0 && DIP(ip, blocks) == 0)) + panic("%s: short symlink", READ_S); + } else if (vp->v_type != VREG && vp->v_type != VDIR) + panic("%s: type %d", READ_S, vp->v_type); +#endif + fs = ip->I_FS; + if ((u_int64_t)uio->uio_offset > ump->um_maxfilesize) + return (EFBIG); + if (uio->uio_resid == 0) + return (0); + +#ifndef LFS_READWRITE + if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT) + return ffs_snapshot_read(vp, uio, ioflag); +#endif /* !LFS_READWRITE */ + + fstrans_start(vp->v_mount, FSTRANS_SHARED); + + if (uio->uio_offset >= ip->i_size) + goto out; + +#ifdef LFS_READWRITE + usepc = (vp->v_type == VREG && ip->i_number != LFS_IFILE_INUM); +#else /* !LFS_READWRITE */ + usepc = vp->v_type == VREG; +#endif /* !LFS_READWRITE */ + if (usepc) { + const int advice = IO_ADV_DECODE(ap->a_ioflag); + + while (uio->uio_resid > 0) { + if (ioflag & IO_DIRECT) { + genfs_directio(vp, uio, ioflag); + } + bytelen = MIN(ip->i_size - uio->uio_offset, + uio->uio_resid); + if (bytelen == 0) + break; + error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice, + UBC_READ | UBC_PARTIALOK | UBC_UNMAP_FLAG(vp)); + if (error) + break; + } + goto out; + } + + for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { + bytesinfile = ip->i_size - uio->uio_offset; + if (bytesinfile <= 0) + break; + lbn = lblkno(fs, uio->uio_offset); + nextlbn = lbn + 1; + size = blksize(fs, ip, lbn); + blkoffset = blkoff(fs, uio->uio_offset); + xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid), + bytesinfile); + + if (lblktosize(fs, nextlbn) >= ip->i_size) + error = bread(vp, lbn, size, NOCRED, 0, &bp); + else { + int nextsize = blksize(fs, ip, nextlbn); + error = breadn(vp, lbn, + size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp); + } + if (error) + break; + + /* + * We should only get non-zero b_resid when an I/O error + * has occurred, which should cause us to break above. + * However, if the short read did not cause an error, + * then we want to ensure that we do not uiomove bad + * or uninitialized data. + */ + size -= bp->b_resid; + if (size < xfersize) { + if (size == 0) + break; + xfersize = size; + } + error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); + if (error) + break; + brelse(bp, 0); + } + if (bp != NULL) + brelse(bp, 0); + + out: + if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) { + ip->i_flag |= IN_ACCESS; + if ((ap->a_ioflag & IO_SYNC) == IO_SYNC) { + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) { + fstrans_done(vp->v_mount); + return error; + } + error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT); + UFS_WAPBL_END(vp->v_mount); + } + } + + fstrans_done(vp->v_mount); + return (error); +} + +/* + * Vnode op for writing. 
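+ * Regular files go through the page cache, allocating backing store with
+ * ufs_balloc_range (or GOP_ALLOC when a whole block is overwritten);
+ * directories and symlinks use the buffer cache path, and a failed write
+ * is truncated back to the original size.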
+ */ +int +WRITE(void *v) +{ + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp; + struct uio *uio; + struct inode *ip; + FS *fs; + struct buf *bp; + kauth_cred_t cred; + daddr_t lbn; + off_t osize, origoff, oldoff, preallocoff, endallocoff, nsize; + int blkoffset, error, flags, ioflag, resid, size, xfersize; + int aflag; + int extended=0; + vsize_t bytelen; + bool async; + bool usepc = false; +#ifdef LFS_READWRITE + bool need_unreserve = false; +#endif + struct ufsmount *ump; + + cred = ap->a_cred; + ioflag = ap->a_ioflag; + uio = ap->a_uio; + vp = ap->a_vp; + ip = VTOI(vp); + ump = ip->i_ump; + + KASSERT(vp->v_size == ip->i_size); +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_WRITE) + panic("%s: mode", WRITE_S); +#endif + + switch (vp->v_type) { + case VREG: + if (ioflag & IO_APPEND) + uio->uio_offset = ip->i_size; + if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) + return (EPERM); + /* FALLTHROUGH */ + case VLNK: + break; + case VDIR: + if ((ioflag & IO_SYNC) == 0) + panic("%s: nonsync dir write", WRITE_S); + break; + default: + panic("%s: type", WRITE_S); + } + + fs = ip->I_FS; + if (uio->uio_offset < 0 || + (u_int64_t)uio->uio_offset + uio->uio_resid > ump->um_maxfilesize) + return (EFBIG); +#ifdef LFS_READWRITE + /* Disallow writes to the Ifile, even if noschg flag is removed */ + /* XXX can this go away when the Ifile is no longer in the namespace? */ + if (vp == fs->lfs_ivnode) + return (EPERM); +#endif + if (uio->uio_resid == 0) + return (0); + + fstrans_start(vp->v_mount, FSTRANS_SHARED); + + flags = ioflag & IO_SYNC ? B_SYNC : 0; + async = vp->v_mount->mnt_flag & MNT_ASYNC; + origoff = uio->uio_offset; + resid = uio->uio_resid; + osize = ip->i_size; + error = 0; + + usepc = vp->v_type == VREG; + + if ((ioflag & IO_JOURNALLOCKED) == 0) { + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) { + fstrans_done(vp->v_mount); + return error; + } + } + +#ifdef LFS_READWRITE + async = true; + lfs_check(vp, LFS_UNUSED_LBN, 0); +#endif /* !LFS_READWRITE */ + if (!usepc) + goto bcache; + + preallocoff = round_page(blkroundup(fs, MAX(osize, uio->uio_offset))); + aflag = ioflag & IO_SYNC ? B_SYNC : 0; + nsize = MAX(osize, uio->uio_offset + uio->uio_resid); + endallocoff = nsize - blkoff(fs, nsize); + + /* + * if we're increasing the file size, deal with expanding + * the fragment if there is one. + */ + + if (nsize > osize && lblkno(fs, osize) < NDADDR && + lblkno(fs, osize) != lblkno(fs, nsize) && + blkroundup(fs, osize) != osize) { + off_t eob; + + eob = blkroundup(fs, osize); + uvm_vnp_setwritesize(vp, eob); + error = ufs_balloc_range(vp, osize, eob - osize, cred, aflag); + if (error) + goto out; + if (flags & B_SYNC) { + mutex_enter(vp->v_interlock); + VOP_PUTPAGES(vp, trunc_page(osize & fs->fs_bmask), + round_page(eob), + PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED); + } + } + + while (uio->uio_resid > 0) { + int ubc_flags = UBC_WRITE; + bool overwrite; /* if we're overwrite a whole block */ + off_t newoff; + + if (ioflag & IO_DIRECT) { + genfs_directio(vp, uio, ioflag | IO_JOURNALLOCKED); + } + + oldoff = uio->uio_offset; + blkoffset = blkoff(fs, uio->uio_offset); + bytelen = MIN(fs->fs_bsize - blkoffset, uio->uio_resid); + if (bytelen == 0) { + break; + } + + /* + * if we're filling in a hole, allocate the blocks now and + * initialize the pages first. 
if we're extending the file, + * we can safely allocate blocks without initializing pages + * since the new blocks will be inaccessible until the write + * is complete. + */ + overwrite = uio->uio_offset >= preallocoff && + uio->uio_offset < endallocoff; + if (!overwrite && (vp->v_vflag & VV_MAPPED) == 0 && + blkoff(fs, uio->uio_offset) == 0 && + (uio->uio_offset & PAGE_MASK) == 0) { + vsize_t len; + + len = trunc_page(bytelen); + len -= blkoff(fs, len); + if (len > 0) { + overwrite = true; + bytelen = len; + } + } + + newoff = oldoff + bytelen; + if (vp->v_size < newoff) { + uvm_vnp_setwritesize(vp, newoff); + } + + if (!overwrite) { + error = ufs_balloc_range(vp, uio->uio_offset, bytelen, + cred, aflag); + if (error) + break; + } else { + genfs_node_wrlock(vp); + error = GOP_ALLOC(vp, uio->uio_offset, bytelen, + aflag, cred); + genfs_node_unlock(vp); + if (error) + break; + ubc_flags |= UBC_FAULTBUSY; + } + + /* + * copy the data. + */ + + error = ubc_uiomove(&vp->v_uobj, uio, bytelen, + IO_ADV_DECODE(ioflag), ubc_flags | UBC_UNMAP_FLAG(vp)); + + /* + * update UVM's notion of the size now that we've + * copied the data into the vnode's pages. + * + * we should update the size even when uiomove failed. + */ + + if (vp->v_size < newoff) { + uvm_vnp_setsize(vp, newoff); + extended = 1; + } + + if (error) + break; + + /* + * flush what we just wrote if necessary. + * XXXUBC simplistic async flushing. + */ + +#ifndef LFS_READWRITE + if (!async && oldoff >> 16 != uio->uio_offset >> 16) { + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, (oldoff >> 16) << 16, + (uio->uio_offset >> 16) << 16, + PGO_CLEANIT | PGO_JOURNALLOCKED); + if (error) + break; + } +#endif + } + if (error == 0 && ioflag & IO_SYNC) { + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, trunc_page(origoff & fs->fs_bmask), + round_page(blkroundup(fs, uio->uio_offset)), + PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED); + } + goto out; + + bcache: + mutex_enter(vp->v_interlock); + VOP_PUTPAGES(vp, trunc_page(origoff), round_page(origoff + resid), + PGO_CLEANIT | PGO_FREE | PGO_SYNCIO | PGO_JOURNALLOCKED); + while (uio->uio_resid > 0) { + lbn = lblkno(fs, uio->uio_offset); + blkoffset = blkoff(fs, uio->uio_offset); + xfersize = MIN(fs->fs_bsize - blkoffset, uio->uio_resid); + if (fs->fs_bsize > xfersize) + flags |= B_CLRBUF; + else + flags &= ~B_CLRBUF; + +#ifdef LFS_READWRITE + error = lfs_reserve(fs, vp, NULL, + btofsb(fs, (NIADDR + 1) << fs->lfs_bshift)); + if (error) + break; + need_unreserve = true; +#endif + error = UFS_BALLOC(vp, uio->uio_offset, xfersize, + ap->a_cred, flags, &bp); + + if (error) + break; + if (uio->uio_offset + xfersize > ip->i_size) { + ip->i_size = uio->uio_offset + xfersize; + DIP_ASSIGN(ip, size, ip->i_size); + uvm_vnp_setsize(vp, ip->i_size); + extended = 1; + } + size = blksize(fs, ip, lbn) - bp->b_resid; + if (xfersize > size) + xfersize = size; + + error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); + + /* + * if we didn't clear the block and the uiomove failed, + * the buf will now contain part of some other file, + * so we need to invalidate it. 
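+ * Releasing it with BC_INVAL below keeps the stale contents from being
+ * reused.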
+ */ + if (error && (flags & B_CLRBUF) == 0) { + brelse(bp, BC_INVAL); + break; + } +#ifdef LFS_READWRITE + (void)VOP_BWRITE(bp->b_vp, bp); + lfs_reserve(fs, vp, NULL, + -btofsb(fs, (NIADDR + 1) << fs->lfs_bshift)); + need_unreserve = false; +#else + if (ioflag & IO_SYNC) + (void)bwrite(bp); + else if (xfersize + blkoffset == fs->fs_bsize) + bawrite(bp); + else + bdwrite(bp); +#endif + if (error || xfersize == 0) + break; + } +#ifdef LFS_READWRITE + if (need_unreserve) { + lfs_reserve(fs, vp, NULL, + -btofsb(fs, (NIADDR + 1) << fs->lfs_bshift)); + } +#endif + + /* + * If we successfully wrote any data, and we are not the superuser + * we clear the setuid and setgid bits as a precaution against + * tampering. + */ +out: + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (vp->v_mount->mnt_flag & MNT_RELATIME) + ip->i_flag |= IN_ACCESS; + if (resid > uio->uio_resid && ap->a_cred && + kauth_authorize_generic(ap->a_cred, KAUTH_GENERIC_ISSUSER, NULL)) { + ip->i_mode &= ~(ISUID | ISGID); + DIP_ASSIGN(ip, mode, ip->i_mode); + } + if (resid > uio->uio_resid) + VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0)); + if (error) { + (void) UFS_TRUNCATE(vp, osize, ioflag & IO_SYNC, ap->a_cred); + uio->uio_offset -= resid - uio->uio_resid; + uio->uio_resid = resid; + } else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC) + error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT); + else + UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); + KASSERT(vp->v_size == ip->i_size); + if ((ioflag & IO_JOURNALLOCKED) == 0) + UFS_WAPBL_END(vp->v_mount); + fstrans_done(vp->v_mount); + + return (error); +} diff --git a/sys/ufs/ufs/ufs_vfsops.c b/sys/ufs/ufs/ufs_vfsops.c new file mode 100644 index 000000000..ac7230bca --- /dev/null +++ b/sys/ufs/ufs/ufs_vfsops.c @@ -0,0 +1,308 @@ +/* $NetBSD: ufs_vfsops.c,v 1.42 2011/03/24 17:05:46 bouyer Exp $ */ + +/* + * Copyright (c) 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_vfsops.c 8.8 (Berkeley) 5/20/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ufs_vfsops.c,v 1.42 2011/03/24 17:05:46 bouyer Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_ffs.h" +#include "opt_quota.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#ifdef UFS_DIRHASH +#include +#endif +#include + +/* how many times ufs_init() was called */ +static int ufs_initcount = 0; + +pool_cache_t ufs_direct_cache; + +/* + * Make a filesystem operational. + * Nothing to do at the moment. + */ +/* ARGSUSED */ +int +ufs_start(struct mount *mp, int flags) +{ + + return (0); +} + +/* + * Return the root of a filesystem. + */ +int +ufs_root(struct mount *mp, struct vnode **vpp) +{ + struct vnode *nvp; + int error; + + if ((error = VFS_VGET(mp, (ino_t)ROOTINO, &nvp)) != 0) + return (error); + *vpp = nvp; + return (0); +} + +/* + * Do operations associated with quotas + */ +int +ufs_quotactl(struct mount *mp, prop_dictionary_t dict) +{ + struct lwp *l = curlwp; + +#if !defined(QUOTA) && !defined(QUOTA2) + (void) mp; + (void) dict; + (void) l; + return (EOPNOTSUPP); +#else + int error; + prop_dictionary_t cmddict; + prop_array_t commands; + prop_object_iterator_t iter; + + /* Mark the mount busy, as we're passing it to kauth(9). */ + error = vfs_busy(mp, NULL); + if (error) + return (error); + + error = quota_get_cmds(dict, &commands); + if (error) + goto out_vfs; + iter = prop_array_iterator(commands); + if (iter == NULL) { + error = ENOMEM; + goto out_vfs; + } + + + mutex_enter(&mp->mnt_updating); + while ((cmddict = prop_object_iterator_next(iter)) != NULL) { + if (prop_object_type(cmddict) != PROP_TYPE_DICTIONARY) + continue; + error = quota_handle_cmd(mp, l, cmddict); + if (error) + break; + } + prop_object_iterator_release(iter); + mutex_exit(&mp->mnt_updating); +out_vfs: + vfs_unbusy(mp, false, NULL); + return (error); +#endif +} + +#if 0 + switch (cmd) { + case Q_SYNC: + break; + + case Q_GETQUOTA: + /* The user can always query about his own quota. */ + if (uid == kauth_cred_getuid(l->l_cred)) + break; + + error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, + KAUTH_REQ_SYSTEM_FS_QUOTA_GET, mp, KAUTH_ARG(uid), NULL); + + break; + + case Q_QUOTAON: + case Q_QUOTAOFF: + error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, + KAUTH_REQ_SYSTEM_FS_QUOTA_ONOFF, mp, NULL, NULL); + + break; + + case Q_SETQUOTA: + case Q_SETUSE: + error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, + KAUTH_REQ_SYSTEM_FS_QUOTA_MANAGE, mp, KAUTH_ARG(uid), NULL); + + break; + + default: + error = EINVAL; + break; + } + + type = cmds & SUBCMDMASK; + if (!error) { + /* Only check if there was no error above. 
*/ + if ((u_int)type >= MAXQUOTAS) + error = EINVAL; + } + + if (error) { + vfs_unbusy(mp, false, NULL); + return (error); + } + + mutex_enter(&mp->mnt_updating); + switch (cmd) { + + case Q_QUOTAON: + error = quotaon(l, mp, type, arg); + break; + + case Q_QUOTAOFF: + error = quotaoff(l, mp, type); + break; + + case Q_SETQUOTA: + error = setquota(mp, uid, type, arg); + break; + + case Q_SETUSE: + error = setuse(mp, uid, type, arg); + break; + + case Q_GETQUOTA: + error = getquota(mp, uid, type, arg); + break; + + case Q_SYNC: + error = qsync(mp); + break; + + default: + error = EINVAL; + } + mutex_exit(&mp->mnt_updating); + vfs_unbusy(mp, false, NULL); + return (error); +#endif + +/* + * This is the generic part of fhtovp called after the underlying + * filesystem has validated the file handle. + */ +int +ufs_fhtovp(struct mount *mp, struct ufid *ufhp, struct vnode **vpp) +{ + struct vnode *nvp; + struct inode *ip; + int error; + + if ((error = VFS_VGET(mp, ufhp->ufid_ino, &nvp)) != 0) { + *vpp = NULLVP; + return (error); + } + ip = VTOI(nvp); + if (ip->i_mode == 0 || ip->i_gen != ufhp->ufid_gen) { + vput(nvp); + *vpp = NULLVP; + return (ESTALE); + } + *vpp = nvp; + return (0); +} + +/* + * Initialize UFS filesystems, done only once. + */ +void +ufs_init(void) +{ + if (ufs_initcount++ > 0) + return; + + ufs_direct_cache = pool_cache_init(sizeof(struct direct), 0, 0, 0, + "ufsdir", NULL, IPL_NONE, NULL, NULL, NULL); + + ufs_ihashinit(); +#if defined(QUOTA) || defined(QUOTA2) + dqinit(); +#endif +#ifdef UFS_DIRHASH + ufsdirhash_init(); +#endif +#ifdef UFS_EXTATTR + ufs_extattr_init(); +#endif +} + +void +ufs_reinit(void) +{ + ufs_ihashreinit(); +#if defined(QUOTA) || defined(QUOTA2) + dqreinit(); +#endif +} + +/* + * Free UFS filesystem resources, done only once. + */ +void +ufs_done(void) +{ + if (--ufs_initcount > 0) + return; + + ufs_ihashdone(); +#if defined(QUOTA) || defined(QUOTA2) + dqdone(); +#endif + pool_cache_destroy(ufs_direct_cache); +#ifdef UFS_DIRHASH + ufsdirhash_done(); +#endif +#ifdef UFS_EXTATTR + ufs_extattr_done(); +#endif +} diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c new file mode 100644 index 000000000..634f96694 --- /dev/null +++ b/sys/ufs/ufs/ufs_vnops.c @@ -0,0 +1,2989 @@ +/* $NetBSD: ufs_vnops.c,v 1.206 2011/11/18 21:18:52 christos Exp $ */ + +/*- + * Copyright (c) 2008 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 1982, 1986, 1989, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_vnops.c 8.28 (Berkeley) 7/31/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ufs_vnops.c,v 1.206 2011/11/18 21:18:52 christos Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_ffs.h" +#include "opt_quota.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#ifdef UFS_DIRHASH +#include +#endif +#include +#include +#include +#include +#include + +#include + +__CTASSERT(EXT2FS_MAXNAMLEN == FFS_MAXNAMLEN); +__CTASSERT(LFS_MAXNAMLEN == FFS_MAXNAMLEN); + +static int ufs_chmod(struct vnode *, int, kauth_cred_t, struct lwp *); +static int ufs_chown(struct vnode *, uid_t, gid_t, kauth_cred_t, + struct lwp *); + +/* + * A virgin directory (no blushing please). + */ +static const struct dirtemplate mastertemplate = { + 0, 12, DT_DIR, 1, ".", + 0, DIRBLKSIZ - 12, DT_DIR, 2, ".." 
+}; + +/* + * Create a regular file + */ +int +ufs_create(void *v) +{ + struct vop_create_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap = v; + int error; + struct vnode *dvp = ap->a_dvp; + struct ufs_lookup_results *ulr; + + /* XXX should handle this material another way */ + ulr = &VTOI(dvp)->i_crap; + UFS_CHECK_CRAPCOUNTER(VTOI(dvp)); + + /* + * UFS_WAPBL_BEGIN1(dvp->v_mount, dvp) performed by successful + * ufs_makeinode + */ + fstrans_start(dvp->v_mount, FSTRANS_SHARED); + error = + ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), + dvp, ulr, ap->a_vpp, ap->a_cnp); + if (error) { + fstrans_done(dvp->v_mount); + return (error); + } + UFS_WAPBL_END1(dvp->v_mount, dvp); + fstrans_done(dvp->v_mount); + VN_KNOTE(dvp, NOTE_WRITE); + return (0); +} + +/* + * Mknod vnode call + */ +/* ARGSUSED */ +int +ufs_mknod(void *v) +{ + struct vop_mknod_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap = v; + struct vattr *vap; + struct vnode **vpp; + struct inode *ip; + int error; + struct mount *mp; + ino_t ino; + struct ufs_lookup_results *ulr; + + vap = ap->a_vap; + vpp = ap->a_vpp; + + /* XXX should handle this material another way */ + ulr = &VTOI(ap->a_dvp)->i_crap; + UFS_CHECK_CRAPCOUNTER(VTOI(ap->a_dvp)); + + /* + * UFS_WAPBL_BEGIN1(dvp->v_mount, dvp) performed by successful + * ufs_makeinode + */ + fstrans_start(ap->a_dvp->v_mount, FSTRANS_SHARED); + if ((error = + ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), + ap->a_dvp, ulr, vpp, ap->a_cnp)) != 0) + goto out; + VN_KNOTE(ap->a_dvp, NOTE_WRITE); + ip = VTOI(*vpp); + mp = (*vpp)->v_mount; + ino = ip->i_number; + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + if (vap->va_rdev != VNOVAL) { + struct ufsmount *ump = ip->i_ump; + /* + * Want to be able to use this to make badblock + * inodes, so don't truncate the dev number. + */ + if (ump->um_fstype == UFS1) + ip->i_ffs1_rdev = ufs_rw32(vap->va_rdev, + UFS_MPNEEDSWAP(ump)); + else + ip->i_ffs2_rdev = ufs_rw64(vap->va_rdev, + UFS_MPNEEDSWAP(ump)); + } + UFS_WAPBL_UPDATE(*vpp, NULL, NULL, 0); + UFS_WAPBL_END1(ap->a_dvp->v_mount, ap->a_dvp); + /* + * Remove inode so that it will be reloaded by VFS_VGET and + * checked to see if it is an alias of an existing entry in + * the inode cache. + */ + (*vpp)->v_type = VNON; + VOP_UNLOCK(*vpp); + vgone(*vpp); + error = VFS_VGET(mp, ino, vpp); +out: + fstrans_done(ap->a_dvp->v_mount); + if (error != 0) { + *vpp = NULL; + return (error); + } + return (0); +} + +/* + * Open called. + * + * Nothing to do. + */ +/* ARGSUSED */ +int +ufs_open(void *v) +{ + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + kauth_cred_t a_cred; + } */ *ap = v; + + /* + * Files marked append-only must be opened for appending. + */ + if ((VTOI(ap->a_vp)->i_flags & APPEND) && + (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) + return (EPERM); + return (0); +} + +/* + * Close called. + * + * Update the times on the inode. 
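+ * The times are only refreshed here while other references to the vnode
+ * remain (v_usecount > 1).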
+ */ +/* ARGSUSED */ +int +ufs_close(void *v) +{ + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp; + struct inode *ip; + + vp = ap->a_vp; + ip = VTOI(vp); + fstrans_start(vp->v_mount, FSTRANS_SHARED); + if (vp->v_usecount > 1) + UFS_ITIMES(vp, NULL, NULL, NULL); + fstrans_done(vp->v_mount); + return (0); +} + +static int +ufs_check_possible(struct vnode *vp, struct inode *ip, mode_t mode, + kauth_cred_t cred) +{ +#if defined(QUOTA) || defined(QUOTA2) + int error; +#endif + + /* + * Disallow write attempts on read-only file systems; + * unless the file is a socket, fifo, or a block or + * character device resident on the file system. + */ + if (mode & VWRITE) { + switch (vp->v_type) { + case VDIR: + case VLNK: + case VREG: + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); +#if defined(QUOTA) || defined(QUOTA2) + fstrans_start(vp->v_mount, FSTRANS_SHARED); + error = chkdq(ip, 0, cred, 0); + fstrans_done(vp->v_mount); + if (error != 0) + return error; +#endif + break; + case VBAD: + case VBLK: + case VCHR: + case VSOCK: + case VFIFO: + case VNON: + default: + break; + } + } + + /* If it is a snapshot, nobody gets access to it. */ + if ((ip->i_flags & SF_SNAPSHOT)) + return (EPERM); + /* If immutable bit set, nobody gets to write it. */ + if ((mode & VWRITE) && (ip->i_flags & IMMUTABLE)) + return (EPERM); + + return 0; +} + +static int +ufs_check_permitted(struct vnode *vp, struct inode *ip, mode_t mode, + kauth_cred_t cred) +{ + + return genfs_can_access(vp->v_type, ip->i_mode & ALLPERMS, ip->i_uid, + ip->i_gid, mode, cred); +} + +int +ufs_access(void *v) +{ + struct vop_access_args /* { + struct vnode *a_vp; + int a_mode; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp; + struct inode *ip; + mode_t mode; + int error; + + vp = ap->a_vp; + ip = VTOI(vp); + mode = ap->a_mode; + + error = ufs_check_possible(vp, ip, mode, ap->a_cred); + if (error) + return error; + + error = ufs_check_permitted(vp, ip, mode, ap->a_cred); + + return error; +} + +/* ARGSUSED */ +int +ufs_getattr(void *v) +{ + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp; + struct inode *ip; + struct vattr *vap; + + vp = ap->a_vp; + ip = VTOI(vp); + vap = ap->a_vap; + fstrans_start(vp->v_mount, FSTRANS_SHARED); + UFS_ITIMES(vp, NULL, NULL, NULL); + + /* + * Copy from inode table + */ + vap->va_fsid = ip->i_dev; + vap->va_fileid = ip->i_number; + vap->va_mode = ip->i_mode & ALLPERMS; + vap->va_nlink = ip->i_nlink; + vap->va_uid = ip->i_uid; + vap->va_gid = ip->i_gid; + vap->va_size = vp->v_size; + if (ip->i_ump->um_fstype == UFS1) { + vap->va_rdev = (dev_t)ufs_rw32(ip->i_ffs1_rdev, + UFS_MPNEEDSWAP(ip->i_ump)); + vap->va_atime.tv_sec = ip->i_ffs1_atime; + vap->va_atime.tv_nsec = ip->i_ffs1_atimensec; + vap->va_mtime.tv_sec = ip->i_ffs1_mtime; + vap->va_mtime.tv_nsec = ip->i_ffs1_mtimensec; + vap->va_ctime.tv_sec = ip->i_ffs1_ctime; + vap->va_ctime.tv_nsec = ip->i_ffs1_ctimensec; + vap->va_birthtime.tv_sec = 0; + vap->va_birthtime.tv_nsec = 0; + vap->va_bytes = dbtob((u_quad_t)ip->i_ffs1_blocks); + } else { + vap->va_rdev = (dev_t)ufs_rw64(ip->i_ffs2_rdev, + UFS_MPNEEDSWAP(ip->i_ump)); + vap->va_atime.tv_sec = ip->i_ffs2_atime; + vap->va_atime.tv_nsec = ip->i_ffs2_atimensec; + vap->va_mtime.tv_sec = ip->i_ffs2_mtime; + vap->va_mtime.tv_nsec = ip->i_ffs2_mtimensec; + vap->va_ctime.tv_sec = ip->i_ffs2_ctime; + vap->va_ctime.tv_nsec = ip->i_ffs2_ctimensec; + 
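/* unlike UFS1, UFS2 carries a real creation (birth) time */ +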
vap->va_birthtime.tv_sec = ip->i_ffs2_birthtime; + vap->va_birthtime.tv_nsec = ip->i_ffs2_birthnsec; + vap->va_bytes = dbtob(ip->i_ffs2_blocks); + } + vap->va_gen = ip->i_gen; + vap->va_flags = ip->i_flags; + + /* this doesn't belong here */ + if (vp->v_type == VBLK) + vap->va_blocksize = BLKDEV_IOSIZE; + else if (vp->v_type == VCHR) + vap->va_blocksize = MAXBSIZE; + else + vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; + vap->va_type = vp->v_type; + vap->va_filerev = ip->i_modrev; + fstrans_done(vp->v_mount); + return (0); +} + +/* + * Set attribute vnode op. called from several syscalls + */ +int +ufs_setattr(void *v) +{ + struct vop_setattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + kauth_cred_t a_cred; + } */ *ap = v; + struct vattr *vap; + struct vnode *vp; + struct inode *ip; + kauth_cred_t cred; + struct lwp *l; + int error; + + vap = ap->a_vap; + vp = ap->a_vp; + ip = VTOI(vp); + cred = ap->a_cred; + l = curlwp; + + /* + * Check for unsettable attributes. + */ + if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || + (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || + (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || + ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { + return (EINVAL); + } + + fstrans_start(vp->v_mount, FSTRANS_SHARED); + + if (vap->va_flags != VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) { + error = EROFS; + goto out; + } + if (kauth_cred_geteuid(cred) != ip->i_uid && + (error = kauth_authorize_generic(cred, + KAUTH_GENERIC_ISSUSER, NULL))) + goto out; + if (kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, + NULL) == 0) { + if ((ip->i_flags & (SF_IMMUTABLE | SF_APPEND)) && + kauth_authorize_system(l->l_cred, + KAUTH_SYSTEM_CHSYSFLAGS, 0, NULL, NULL, NULL)) { + error = EPERM; + goto out; + } + /* Snapshot flag cannot be set or cleared */ + if ((vap->va_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) != + (ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL))) { + error = EPERM; + goto out; + } + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) + goto out; + ip->i_flags = vap->va_flags; + DIP_ASSIGN(ip, flags, ip->i_flags); + } else { + if ((ip->i_flags & (SF_IMMUTABLE | SF_APPEND)) || + (vap->va_flags & UF_SETTABLE) != vap->va_flags) { + error = EPERM; + goto out; + } + if ((ip->i_flags & SF_SETTABLE) != + (vap->va_flags & SF_SETTABLE)) { + error = EPERM; + goto out; + } + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) + goto out; + ip->i_flags &= SF_SETTABLE; + ip->i_flags |= (vap->va_flags & UF_SETTABLE); + DIP_ASSIGN(ip, flags, ip->i_flags); + } + ip->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); + UFS_WAPBL_END(vp->v_mount); + if (vap->va_flags & (IMMUTABLE | APPEND)) { + error = 0; + goto out; + } + } + if (ip->i_flags & (IMMUTABLE | APPEND)) { + error = EPERM; + goto out; + } + /* + * Go through the fields and update iff not VNOVAL. + */ + if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) { + error = EROFS; + goto out; + } + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) + goto out; + error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred, l); + UFS_WAPBL_END(vp->v_mount); + if (error) + goto out; + } + if (vap->va_size != VNOVAL) { + /* + * Disallow write attempts on read-only file systems; + * unless the file is a socket, fifo, or a block or + * character device resident on the file system. 
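+ * A size change on a directory is rejected with EISDIR, and vnode types
+ * other than regular files, devices and fifos get EOPNOTSUPP.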
+ */ + switch (vp->v_type) { + case VDIR: + error = EISDIR; + goto out; + case VCHR: + case VBLK: + case VFIFO: + break; + case VREG: + if (vp->v_mount->mnt_flag & MNT_RDONLY) { + error = EROFS; + goto out; + } + if ((ip->i_flags & SF_SNAPSHOT) != 0) { + error = EPERM; + goto out; + } + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) + goto out; + /* + * When journaling, only truncate one indirect block + * at a time. + */ + if (vp->v_mount->mnt_wapbl) { + uint64_t incr = MNINDIR(ip->i_ump) << + vp->v_mount->mnt_fs_bshift; /* Power of 2 */ + uint64_t base = NDADDR << + vp->v_mount->mnt_fs_bshift; + while (!error && ip->i_size > base + incr && + ip->i_size > vap->va_size + incr) { + /* + * round down to next full indirect + * block boundary. + */ + uint64_t nsize = base + + ((ip->i_size - base - 1) & + ~(incr - 1)); + error = UFS_TRUNCATE(vp, nsize, 0, + cred); + if (error == 0) { + UFS_WAPBL_END(vp->v_mount); + error = + UFS_WAPBL_BEGIN(vp->v_mount); + } + } + } + if (!error) + error = UFS_TRUNCATE(vp, vap->va_size, 0, cred); + UFS_WAPBL_END(vp->v_mount); + if (error) + goto out; + break; + default: + error = EOPNOTSUPP; + goto out; + } + } + ip = VTOI(vp); + if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || + vap->va_birthtime.tv_sec != VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) { + error = EROFS; + goto out; + } + if ((ip->i_flags & SF_SNAPSHOT) != 0) { + error = EPERM; + goto out; + } + error = genfs_can_chtimes(vp, vap->va_vaflags, ip->i_uid, cred); + if (error) + goto out; + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) + goto out; + if (vap->va_atime.tv_sec != VNOVAL) + if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) + ip->i_flag |= IN_ACCESS; + if (vap->va_mtime.tv_sec != VNOVAL) { + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (vp->v_mount->mnt_flag & MNT_RELATIME) + ip->i_flag |= IN_ACCESS; + } + if (vap->va_birthtime.tv_sec != VNOVAL && + ip->i_ump->um_fstype == UFS2) { + ip->i_ffs2_birthtime = vap->va_birthtime.tv_sec; + ip->i_ffs2_birthnsec = vap->va_birthtime.tv_nsec; + } + error = UFS_UPDATE(vp, &vap->va_atime, &vap->va_mtime, 0); + UFS_WAPBL_END(vp->v_mount); + if (error) + goto out; + } + error = 0; + if (vap->va_mode != (mode_t)VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) { + error = EROFS; + goto out; + } + if ((ip->i_flags & SF_SNAPSHOT) != 0 && + (vap->va_mode & (S_IXUSR | S_IWUSR | S_IXGRP | S_IWGRP | + S_IXOTH | S_IWOTH))) { + error = EPERM; + goto out; + } + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) + goto out; + error = ufs_chmod(vp, (int)vap->va_mode, cred, l); + UFS_WAPBL_END(vp->v_mount); + } + VN_KNOTE(vp, NOTE_ATTRIB); +out: + fstrans_done(vp->v_mount); + return (error); +} + +/* + * Change the mode on a file. + * Inode must be locked before calling. + */ +static int +ufs_chmod(struct vnode *vp, int mode, kauth_cred_t cred, struct lwp *l) +{ + struct inode *ip; + int error; + + UFS_WAPBL_JLOCK_ASSERT(vp->v_mount); + + ip = VTOI(vp); + + error = genfs_can_chmod(vp, cred, ip->i_uid, ip->i_gid, mode); + if (error) + return (error); + + fstrans_start(vp->v_mount, FSTRANS_SHARED); + ip->i_mode &= ~ALLPERMS; + ip->i_mode |= (mode & ALLPERMS); + ip->i_flag |= IN_CHANGE; + DIP_ASSIGN(ip, mode, ip->i_mode); + UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); + fstrans_done(vp->v_mount); + return (0); +} + +/* + * Perform chown operation on inode ip; + * inode must be locked prior to call. 
+ */ +static int +ufs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred, + struct lwp *l) +{ + struct inode *ip; + int error = 0; +#if defined(QUOTA) || defined(QUOTA2) + uid_t ouid; + gid_t ogid; + int64_t change; +#endif + ip = VTOI(vp); + error = 0; + + if (uid == (uid_t)VNOVAL) + uid = ip->i_uid; + if (gid == (gid_t)VNOVAL) + gid = ip->i_gid; + + error = genfs_can_chown(vp, cred, ip->i_uid, ip->i_gid, uid, gid); + if (error) + return (error); + + fstrans_start(vp->v_mount, FSTRANS_SHARED); +#if defined(QUOTA) || defined(QUOTA2) + ogid = ip->i_gid; + ouid = ip->i_uid; + change = DIP(ip, blocks); + (void) chkdq(ip, -change, cred, 0); + (void) chkiq(ip, -1, cred, 0); +#endif + ip->i_gid = gid; + DIP_ASSIGN(ip, gid, gid); + ip->i_uid = uid; + DIP_ASSIGN(ip, uid, uid); +#if defined(QUOTA) || defined(QUOTA2) + if ((error = chkdq(ip, change, cred, 0)) == 0) { + if ((error = chkiq(ip, 1, cred, 0)) == 0) + goto good; + else + (void) chkdq(ip, -change, cred, FORCE); + } + ip->i_gid = ogid; + DIP_ASSIGN(ip, gid, ogid); + ip->i_uid = ouid; + DIP_ASSIGN(ip, uid, ouid); + (void) chkdq(ip, change, cred, FORCE); + (void) chkiq(ip, 1, cred, FORCE); + fstrans_done(vp->v_mount); + return (error); + good: +#endif /* QUOTA || QUOTA2 */ + ip->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); + fstrans_done(vp->v_mount); + return (0); +} + +int +ufs_remove(void *v) +{ + struct vop_remove_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap = v; + struct vnode *vp, *dvp; + struct inode *ip; + int error; + struct ufs_lookup_results *ulr; + + vp = ap->a_vp; + dvp = ap->a_dvp; + ip = VTOI(vp); + + /* XXX should handle this material another way */ + ulr = &VTOI(dvp)->i_crap; + UFS_CHECK_CRAPCOUNTER(VTOI(dvp)); + + fstrans_start(dvp->v_mount, FSTRANS_SHARED); + if (vp->v_type == VDIR || (ip->i_flags & (IMMUTABLE | APPEND)) || + (VTOI(dvp)->i_flags & APPEND)) + error = EPERM; + else { + error = UFS_WAPBL_BEGIN(dvp->v_mount); + if (error == 0) { + error = ufs_dirremove(dvp, ulr, + ip, ap->a_cnp->cn_flags, 0); + UFS_WAPBL_END(dvp->v_mount); + } + } + VN_KNOTE(vp, NOTE_DELETE); + VN_KNOTE(dvp, NOTE_WRITE); + if (dvp == vp) + vrele(vp); + else + vput(vp); + vput(dvp); + fstrans_done(dvp->v_mount); + return (error); +} + +/* + * ufs_link: create hard link. 
+ */ +int +ufs_link(void *v) +{ + struct vop_link_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap = v; + struct vnode *dvp = ap->a_dvp; + struct vnode *vp = ap->a_vp; + struct componentname *cnp = ap->a_cnp; + struct inode *ip; + struct direct *newdir; + int error; + struct ufs_lookup_results *ulr; + + KASSERT(dvp != vp); + KASSERT(vp->v_type != VDIR); + KASSERT(dvp->v_mount == vp->v_mount); + + /* XXX should handle this material another way */ + ulr = &VTOI(dvp)->i_crap; + UFS_CHECK_CRAPCOUNTER(VTOI(dvp)); + + fstrans_start(dvp->v_mount, FSTRANS_SHARED); + error = vn_lock(vp, LK_EXCLUSIVE); + if (error) { + VOP_ABORTOP(dvp, cnp); + goto out2; + } + ip = VTOI(vp); + if ((nlink_t)ip->i_nlink >= LINK_MAX) { + VOP_ABORTOP(dvp, cnp); + error = EMLINK; + goto out1; + } + if (ip->i_flags & (IMMUTABLE | APPEND)) { + VOP_ABORTOP(dvp, cnp); + error = EPERM; + goto out1; + } + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) { + VOP_ABORTOP(dvp, cnp); + goto out1; + } + ip->i_nlink++; + DIP_ASSIGN(ip, nlink, ip->i_nlink); + ip->i_flag |= IN_CHANGE; + error = UFS_UPDATE(vp, NULL, NULL, UPDATE_DIROP); + if (!error) { + newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK); + ufs_makedirentry(ip, cnp, newdir); + error = ufs_direnter(dvp, ulr, vp, newdir, cnp, NULL); + pool_cache_put(ufs_direct_cache, newdir); + } + if (error) { + ip->i_nlink--; + DIP_ASSIGN(ip, nlink, ip->i_nlink); + ip->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(vp, NULL, NULL, UPDATE_DIROP); + } + UFS_WAPBL_END(vp->v_mount); + out1: + VOP_UNLOCK(vp); + out2: + VN_KNOTE(vp, NOTE_LINK); + VN_KNOTE(dvp, NOTE_WRITE); + vput(dvp); + fstrans_done(dvp->v_mount); + return (error); +} + +/* + * whiteout vnode call + */ +int +ufs_whiteout(void *v) +{ + struct vop_whiteout_args /* { + struct vnode *a_dvp; + struct componentname *a_cnp; + int a_flags; + } */ *ap = v; + struct vnode *dvp = ap->a_dvp; + struct componentname *cnp = ap->a_cnp; + struct direct *newdir; + int error; + struct ufsmount *ump = VFSTOUFS(dvp->v_mount); + struct ufs_lookup_results *ulr; + + /* XXX should handle this material another way */ + ulr = &VTOI(dvp)->i_crap; + UFS_CHECK_CRAPCOUNTER(VTOI(dvp)); + + error = 0; + switch (ap->a_flags) { + case LOOKUP: + /* 4.4 format directories support whiteout operations */ + if (ump->um_maxsymlinklen > 0) + return (0); + return (EOPNOTSUPP); + + case CREATE: + /* create a new directory whiteout */ + fstrans_start(dvp->v_mount, FSTRANS_SHARED); + error = UFS_WAPBL_BEGIN(dvp->v_mount); + if (error) + break; +#ifdef DIAGNOSTIC + if (ump->um_maxsymlinklen <= 0) + panic("ufs_whiteout: old format filesystem"); +#endif + + newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK); + newdir->d_ino = WINO; + newdir->d_namlen = cnp->cn_namelen; + memcpy(newdir->d_name, cnp->cn_nameptr, + (size_t)cnp->cn_namelen); + newdir->d_name[cnp->cn_namelen] = '\0'; + newdir->d_type = DT_WHT; + error = ufs_direnter(dvp, ulr, NULL, newdir, cnp, NULL); + pool_cache_put(ufs_direct_cache, newdir); + break; + + case DELETE: + /* remove an existing directory whiteout */ + fstrans_start(dvp->v_mount, FSTRANS_SHARED); + error = UFS_WAPBL_BEGIN(dvp->v_mount); + if (error) + break; +#ifdef DIAGNOSTIC + if (ump->um_maxsymlinklen <= 0) + panic("ufs_whiteout: old format filesystem"); +#endif + + cnp->cn_flags &= ~DOWHITEOUT; + error = ufs_dirremove(dvp, ulr, NULL, cnp->cn_flags, 0); + break; + default: + panic("ufs_whiteout: unknown op"); + /* NOTREACHED */ + } + UFS_WAPBL_END(dvp->v_mount); + fstrans_done(dvp->v_mount); + 
return (error); +} + + +/* + * Rename vnode operation + * rename("foo", "bar"); + * is essentially + * unlink("bar"); + * link("foo", "bar"); + * unlink("foo"); + * but ``atomically''. Can't do full commit without saving state in the + * inode on disk which isn't feasible at this time. Best we can do is + * always guarantee the target exists. + * + * Basic algorithm is: + * + * 1) Bump link count on source while we're linking it to the + * target. This also ensure the inode won't be deleted out + * from underneath us while we work (it may be truncated by + * a concurrent `trunc' or `open' for creation). + * 2) Link source to destination. If destination already exists, + * delete it first. + * 3) Unlink source reference to inode if still around. If a + * directory was moved and the parent of the destination + * is different from the source, patch the ".." entry in the + * directory. + */ + +/* + * Notes on rename locking: + * + * We lock parent vnodes before child vnodes. This means in particular + * that if A is above B in the directory tree then A must be locked + * before B. (This is true regardless of how many steps appear in + * between, because an arbitrary number of other processes could lock + * parent/child in between and establish a lock cycle and deadlock.) + * + * Therefore, if tdvp is above fdvp we must lock tdvp first; if fdvp + * is above tdvp we must lock fdvp first; and if they're + * incommensurate it doesn't matter. (But, we rely on the fact that + * there's a whole-volume rename lock to prevent deadlock among groups + * of renames upon overlapping sets of incommensurate vnodes.) + * + * In addition to establishing lock ordering the parent check also + * serves to rule out cases where someone tries to move a directory + * underneath itself, e.g. rename("a/b", "a/b/c"). If allowed to + * proceed such renames would detach portions of the directory tree + * and make fsck very unhappy. + * + * Note that it is an error for *fvp* to be above tdvp; however, + * *fdvp* can be above tdvp, as in rename("a/b", "a/c/d"). + * + * The parent check searches up the tree from tdvp until it either + * finds fdvp or the root of the volume. It also returns the vnode it + * saw immediately before fdvp, if any. Later on (after looking up + * fvp) we will check to see if this *is* fvp and if so fail. + * + * If the parent check finds fdvp, it means fdvp is above tdvp, so we + * lock fdvp first and then tdvp. Otherwise, either tdvp is above fdvp + * or they're incommensurate and we lock tdvp first. + * + * In either case each of the child vnodes has to be looked up and + * locked immediately after its parent. The cases + * + * fdvp/fvp/[.../]tdvp/tvp + * tdvp/tvp/[.../]fdvp/fvp + * + * can cause deadlock otherwise. Note that both of these are error + * cases; the first fails the parent check and the second fails + * because tvp isn't empty. The parent check case is handled before + * we start locking; however, the nonempty case requires locking tvp + * to find out safely that it's nonempty. 
+ * + * Therefore the procedure is either + * + * lock fdvp + * lookup fvp + * lock fvp + * lock tdvp + * lookup tvp + * lock tvp + * + * or + * + * lock tdvp + * lookup tvp + * lock tvp + * lock fdvp + * lookup fvp + * lock fvp + * + * This could in principle be simplified by always looking up fvp + * last; because of the parent check we know by the time we start + * locking that fvp cannot be directly above tdvp, so (given the + * whole-volume rename lock and other assumptions) it's safe to lock + * tdvp before fvp. This would allow the following scheme: + * + * lock fdvp + * lock tdvp + * or + * lock tdvp + * lock fdvp + * + * then + * lookup tvp + * lock tvp + * lookup fvp + * check if fvp is above of tdvp, fail if so + * lock fvp + * + * which is much, much simpler. + * + * However, current levels of vfs namei/lookup sanity do not permit + * this. It is impossible currently to look up fvp without locking it. + * (It gets locked regardless of whether LOCKLEAF is set; without + * LOCKLEAF it just gets unlocked again, which doesn't help.) + * + * Therefore, because we must look up fvp to know if it's above tdvp, + * which locks fvp, we must, at least in the case where fdvp is above + * tdvp, do that before locking tdvp. The longer scheme does that; the + * simpler scheme is not safe. + * + * Note that for now we aren't doing lookup() but relookup(); however, + * the differences are minor. + * + * On top of all the above, just to make everything more + * exciting, any two of the vnodes might end up being the same. + * + * FROMPARENT == FROMCHILD mv a/. foo is an error. + * FROMPARENT == TOPARENT mv a/b a/c is ok. + * FROMPARENT == TOCHILD mv a/b/c a/b will give ENOTEMPTY. + * FROMCHILD == TOPARENT mv a/b a/b/c fails the parent check. + * FROMCHILD == TOCHILD mv a/b a/b is ok. + * TOPARENT == TOCHILD mv foo a/. is an error. + * + * This introduces more cases in the locking, because each distinct + * vnode must be locked exactly once. + * + * When FROMPARENT == TOPARENT and FROMCHILD != TOCHILD we assume it + * doesn't matter what order the children are locked in, because the + * per-volume rename lock excludes other renames and no other + * operation locks two files in the same directory at once. (Note: if + * it turns out that link() does, link() is wrong.) + * + * Until such time as we can do lookups without the namei and lookup + * machinery "helpfully" locking the result vnode for us, we can't + * avoid tripping on cases where FROMCHILD == TOCHILD. Currently for + * non-directories we unlock the first one we lock while looking up + * the second, then relock it if necessary. This is more or less + * harmless since not much of interest can happen to the objects in + * that window while we have the containing directory locked; but it's + * not desirable and should be cleaned up when that becomes possible. + * The right way to do it is to check after looking the second one up + * and only lock it if it's different. (Note: for directories we don't + * do this dance because the same directory can't appear more than + * once.) + */ + +/* XXX following lifted from ufs_lookup.c */ +#define FSFMT(vp) (((vp)->v_mount->mnt_iflag & IMNT_DTYPE) == 0) + +/* + * Check if either entry referred to by FROM_ULR is within the range + * of entries named by TO_ULR. 
+ */ +static int +ulr_overlap(const struct ufs_lookup_results *from_ulr, + const struct ufs_lookup_results *to_ulr) +{ + doff_t from_start, from_prevstart; + doff_t to_start, to_end; + + /* + * FROM is a DELETE result; offset points to the entry to + * remove and subtracting count gives the previous entry. + */ + from_start = from_ulr->ulr_offset - from_ulr->ulr_count; + from_prevstart = from_ulr->ulr_offset; + + /* + * TO is a RENAME (thus non-DELETE) result; offset points + * to the beginning of a region to write in, and adding + * count gives the end of the region. + */ + to_start = to_ulr->ulr_offset; + to_end = to_ulr->ulr_offset + to_ulr->ulr_count; + + if (from_prevstart >= to_start && from_prevstart < to_end) { + return 1; + } + if (from_start >= to_start && from_start < to_end) { + return 1; + } + return 0; +} + +/* + * Wrapper for relookup that also updates the supplemental results. + */ +static int +do_relookup(struct vnode *dvp, struct ufs_lookup_results *ulr, + struct vnode **vp, struct componentname *cnp) +{ + int error; + + error = relookup(dvp, vp, cnp, 0); + if (error) { + return error; + } + /* update the supplemental reasults */ + *ulr = VTOI(dvp)->i_crap; + UFS_CHECK_CRAPCOUNTER(VTOI(dvp)); + return 0; +} + +/* + * Lock and relookup a sequence of two directories and two children. + * + */ +static int +lock_vnode_sequence(struct vnode *d1, struct ufs_lookup_results *ulr1, + struct vnode **v1_ret, struct componentname *cn1, + int v1_missing_ok, + int overlap_error, + struct vnode *d2, struct ufs_lookup_results *ulr2, + struct vnode **v2_ret, struct componentname *cn2, + int v2_missing_ok) +{ + struct vnode *v1, *v2; + int error; + + KASSERT(d1 != d2); + + vn_lock(d1, LK_EXCLUSIVE | LK_RETRY); + if (VTOI(d1)->i_size == 0) { + /* d1 has been rmdir'd */ + VOP_UNLOCK(d1); + return ENOENT; + } + error = do_relookup(d1, ulr1, &v1, cn1); + if (v1_missing_ok) { + if (error == ENOENT) { + /* + * Note: currently if the name doesn't exist, + * relookup succeeds (it intercepts the + * EJUSTRETURN from VOP_LOOKUP) and sets tvp + * to NULL. Therefore, we will never get + * ENOENT and this branch is not needed. + * However, in a saner future the EJUSTRETURN + * garbage will go away, so let's DTRT. + */ + v1 = NULL; + error = 0; + } + } else { + if (error == 0 && v1 == NULL) { + /* This is what relookup sets if v1 disappeared. */ + error = ENOENT; + } + } + if (error) { + VOP_UNLOCK(d1); + return error; + } + if (v1 && v1 == d2) { + VOP_UNLOCK(d1); + VOP_UNLOCK(v1); + vrele(v1); + return overlap_error; + } + + /* + * The right way to do this is to do lookups without locking + * the results, and lock the results afterwards; then at the + * end we can avoid trying to lock v2 if v2 == v1. + * + * However, for the reasons described in the fdvp == tdvp case + * in rename below, we can't do that safely. So, in the case + * where v1 is not a directory, unlock it and lock it again + * afterwards. This is safe in locking order because a + * non-directory can't be above anything else in the tree. If + * v1 *is* a directory, that's not true, but then because d1 + * != d2, v1 != v2. 
+ */ + if (v1 && v1->v_type != VDIR) { + VOP_UNLOCK(v1); + } + vn_lock(d2, LK_EXCLUSIVE | LK_RETRY); + if (VTOI(d2)->i_size == 0) { + /* d2 has been rmdir'd */ + VOP_UNLOCK(d2); + if (v1 && v1->v_type == VDIR) { + VOP_UNLOCK(v1); + } + VOP_UNLOCK(d1); + if (v1) { + vrele(v1); + } + return ENOENT; + } + error = do_relookup(d2, ulr2, &v2, cn2); + if (v2_missing_ok) { + if (error == ENOENT) { + /* as above */ + v2 = NULL; + error = 0; + } + } else { + if (error == 0 && v2 == NULL) { + /* This is what relookup sets if v2 disappeared. */ + error = ENOENT; + } + } + if (error) { + VOP_UNLOCK(d2); + if (v1 && v1->v_type == VDIR) { + VOP_UNLOCK(v1); + } + VOP_UNLOCK(d1); + if (v1) { + vrele(v1); + } + return error; + } + if (v1 && v1->v_type != VDIR && v1 != v2) { + vn_lock(v1, LK_EXCLUSIVE | LK_RETRY); + } + *v1_ret = v1; + *v2_ret = v2; + return 0; +} + +/* + * Rename vnode operation + * rename("foo", "bar"); + * is essentially + * unlink("bar"); + * link("foo", "bar"); + * unlink("foo"); + * but ``atomically''. Can't do full commit without saving state in the + * inode on disk which isn't feasible at this time. Best we can do is + * always guarantee the target exists. + * + * Basic algorithm is: + * + * 1) Bump link count on source while we're linking it to the + * target. This also ensure the inode won't be deleted out + * from underneath us while we work (it may be truncated by + * a concurrent `trunc' or `open' for creation). + * 2) Link source to destination. If destination already exists, + * delete it first. + * 3) Unlink source reference to inode if still around. If a + * directory was moved and the parent of the destination + * is different from the source, patch the ".." entry in the + * directory. + */ +int +ufs_rename(void *v) +{ + struct vop_rename_args /* { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; + } */ *ap = v; + struct vnode *tvp, *tdvp, *fvp, *fdvp; + struct componentname *tcnp, *fcnp; + struct inode *ip, *txp, *fxp, *tdp, *fdp; + struct mount *mp; + struct direct *newdir; + int doingdirectory, error; + ino_t oldparent, newparent; + + struct ufs_lookup_results from_ulr, to_ulr; + + tvp = ap->a_tvp; + tdvp = ap->a_tdvp; + fvp = ap->a_fvp; + fdvp = ap->a_fdvp; + tcnp = ap->a_tcnp; + fcnp = ap->a_fcnp; + doingdirectory = error = 0; + oldparent = newparent = 0; + + /* save the supplemental lookup results as they currently exist */ + from_ulr = VTOI(fdvp)->i_crap; + to_ulr = VTOI(tdvp)->i_crap; + UFS_CHECK_CRAPCOUNTER(VTOI(fdvp)); + UFS_CHECK_CRAPCOUNTER(VTOI(tdvp)); + + /* + * Owing to VFS oddities we are currently called with tdvp/tvp + * locked and not fdvp/fvp. In a sane world we'd be passed + * tdvp and fdvp only, unlocked, and two name strings. Pretend + * we have a sane world and unlock tdvp and tvp. + */ + VOP_UNLOCK(tdvp); + if (tvp && tvp != tdvp) { + VOP_UNLOCK(tvp); + } + + /* Also pretend we have a sane world and vrele fvp/tvp. */ + vrele(fvp); + fvp = NULL; + if (tvp) { + vrele(tvp); + tvp = NULL; + } + + /* + * Check for cross-device rename. + */ + if (fdvp->v_mount != tdvp->v_mount) { + error = EXDEV; + goto abort; + } + + /* + * Reject "." and ".." + */ + if ((fcnp->cn_flags & ISDOTDOT) || (tcnp->cn_flags & ISDOTDOT) || + (fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || + (tcnp->cn_namelen == 1 && tcnp->cn_nameptr[0] == '.')) { + error = EINVAL; + goto abort; + } + + /* + * Get locks. 
+ */ + + /* paranoia */ + fcnp->cn_flags |= LOCKPARENT|LOCKLEAF; + tcnp->cn_flags |= LOCKPARENT|LOCKLEAF; + + if (fdvp == tdvp) { + /* One directory. Lock it and relookup both children. */ + vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY); + + if (VTOI(fdvp)->i_size == 0) { + /* directory has been rmdir'd */ + VOP_UNLOCK(fdvp); + error = ENOENT; + goto abort; + } + + error = do_relookup(fdvp, &from_ulr, &fvp, fcnp); + if (error == 0 && fvp == NULL) { + /* relookup may produce this if fvp disappears */ + error = ENOENT; + } + if (error) { + VOP_UNLOCK(fdvp); + goto abort; + } + + /* + * The right way to do this is to look up both children + * without locking either, and then lock both unless they + * turn out to be the same. However, due to deep-seated + * VFS-level issues all lookups lock the child regardless + * of whether LOCKLEAF is set (if LOCKLEAF is not set, + * the child is locked during lookup and then unlocked) + * so it is not safe to look up tvp while fvp is locked. + * + * Unlocking fvp here temporarily is more or less safe, + * because with the directory locked there's not much + * that can happen to it. However, ideally it wouldn't + * be necessary. XXX. + */ + VOP_UNLOCK(fvp); + /* remember fdvp == tdvp so tdvp is locked */ + error = do_relookup(tdvp, &to_ulr, &tvp, tcnp); + if (error && error != ENOENT) { + VOP_UNLOCK(fdvp); + goto abort; + } + if (error == ENOENT) { + /* + * Note: currently if the name doesn't exist, + * relookup succeeds (it intercepts the + * EJUSTRETURN from VOP_LOOKUP) and sets tvp + * to NULL. Therefore, we will never get + * ENOENT and this branch is not needed. + * However, in a saner future the EJUSTRETURN + * garbage will go away, so let's DTRT. + */ + tvp = NULL; + } + + /* tvp is locked; lock fvp if necessary */ + if (!tvp || tvp != fvp) { + vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY); + } + } else { + int found_fdvp; + struct vnode *illegal_fvp; + + /* + * The source must not be above the destination. (If + * it were, the rename would detach a section of the + * tree.) + * + * Look up the tree from tdvp to see if we find fdvp, + * and if so, return the immediate child of fdvp we're + * under; that must not turn out to be the same as + * fvp. + * + * The per-volume rename lock guarantees that the + * result of this check remains true until we finish + * looking up and locking. + */ + error = ufs_parentcheck(fdvp, tdvp, fcnp->cn_cred, + &found_fdvp, &illegal_fvp); + if (error) { + goto abort; + } + + /* Must lock in tree order. */ + + if (found_fdvp) { + /* fdvp -> fvp -> tdvp -> tvp */ + error = lock_vnode_sequence(fdvp, &from_ulr, + &fvp, fcnp, 0, + EINVAL, + tdvp, &to_ulr, + &tvp, tcnp, 1); + } else { + /* tdvp -> tvp -> fdvp -> fvp */ + error = lock_vnode_sequence(tdvp, &to_ulr, + &tvp, tcnp, 1, + ENOTEMPTY, + fdvp, &from_ulr, + &fvp, fcnp, 0); + } + if (error) { + if (illegal_fvp) { + vrele(illegal_fvp); + } + goto abort; + } + KASSERT(fvp != NULL); + + if (illegal_fvp && fvp == illegal_fvp) { + vrele(illegal_fvp); + error = EINVAL; + goto abort_withlocks; + } + + if (illegal_fvp) { + vrele(illegal_fvp); + } + } + + KASSERT(fdvp && VOP_ISLOCKED(fdvp)); + KASSERT(fvp && VOP_ISLOCKED(fvp)); + KASSERT(tdvp && VOP_ISLOCKED(tdvp)); + KASSERT(tvp == NULL || VOP_ISLOCKED(tvp)); + + /* --- everything is now locked --- */ + + if (tvp && ((VTOI(tvp)->i_flags & (IMMUTABLE | APPEND)) || + (VTOI(tdvp)->i_flags & APPEND))) { + error = EPERM; + goto abort_withlocks; + } + + /* + * Check if just deleting a link name. 
+ */ + if (fvp == tvp) { + if (fvp->v_type == VDIR) { + error = EINVAL; + goto abort_withlocks; + } + + /* Release destination completely. Leave fdvp locked. */ + VOP_ABORTOP(tdvp, tcnp); + if (fdvp != tdvp) { + VOP_UNLOCK(tdvp); + } + VOP_UNLOCK(tvp); + vrele(tdvp); + vrele(tvp); + + /* Delete source. */ + /* XXX: do we really need to relookup again? */ + + /* + * fdvp is still locked, but we just unlocked fvp + * (because fvp == tvp) so just decref fvp + */ + vrele(fvp); + fcnp->cn_flags &= ~(MODMASK); + fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; + fcnp->cn_nameiop = DELETE; + if ((error = relookup(fdvp, &fvp, fcnp, 0))) { + vput(fdvp); + return (error); + } + return (VOP_REMOVE(fdvp, fvp, fcnp)); + } + fdp = VTOI(fdvp); + ip = VTOI(fvp); + if ((nlink_t) ip->i_nlink >= LINK_MAX) { + error = EMLINK; + goto abort_withlocks; + } + if ((ip->i_flags & (IMMUTABLE | APPEND)) || + (fdp->i_flags & APPEND)) { + error = EPERM; + goto abort_withlocks; + } + if ((ip->i_mode & IFMT) == IFDIR) { + /* + * Avoid ".", "..", and aliases of "." for obvious reasons. + */ + if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || + fdp == ip || + (fcnp->cn_flags & ISDOTDOT) || + (tcnp->cn_flags & ISDOTDOT) || + (ip->i_flag & IN_RENAME)) { + error = EINVAL; + goto abort_withlocks; + } + ip->i_flag |= IN_RENAME; + doingdirectory = 1; + } + oldparent = fdp->i_number; + VN_KNOTE(fdvp, NOTE_WRITE); /* XXXLUKEM/XXX: right place? */ + + /* + * Both the directory + * and target vnodes are locked. + */ + tdp = VTOI(tdvp); + txp = NULL; + if (tvp) + txp = VTOI(tvp); + + mp = fdvp->v_mount; + fstrans_start(mp, FSTRANS_SHARED); + + if (oldparent != tdp->i_number) + newparent = tdp->i_number; + + /* + * If ".." must be changed (ie the directory gets a new + * parent) the user must have write permission in the source + * so as to be able to change "..". + */ + if (doingdirectory && newparent) { + error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred); + if (error) + goto out; + } + + KASSERT(fdvp != tvp); + + if (newparent) { + /* Check for the rename("foo/foo", "foo") case. */ + if (fdvp == tvp) { + error = doingdirectory ? ENOTEMPTY : EISDIR; + goto out; + } + } + + fxp = VTOI(fvp); + fdp = VTOI(fdvp); + + error = UFS_WAPBL_BEGIN(fdvp->v_mount); + if (error) + goto out2; + + /* + * 1) Bump link count while we're moving stuff + * around. If we crash somewhere before + * completing our work, the link count + * may be wrong, but correctable. + */ + ip->i_nlink++; + DIP_ASSIGN(ip, nlink, ip->i_nlink); + ip->i_flag |= IN_CHANGE; + if ((error = UFS_UPDATE(fvp, NULL, NULL, UPDATE_DIROP)) != 0) { + goto bad; + } + + /* + * 2) If target doesn't exist, link the target + * to the source and unlink the source. + * Otherwise, rewrite the target directory + * entry to reference the source inode and + * expunge the original entry's existence. + */ + if (txp == NULL) { + if (tdp->i_dev != ip->i_dev) + panic("rename: EXDEV"); + /* + * Account for ".." in new directory. + * When source and destination have the same + * parent we don't fool with the link count. 
+ */ + if (doingdirectory && newparent) { + if ((nlink_t)tdp->i_nlink >= LINK_MAX) { + error = EMLINK; + goto bad; + } + tdp->i_nlink++; + DIP_ASSIGN(tdp, nlink, tdp->i_nlink); + tdp->i_flag |= IN_CHANGE; + if ((error = UFS_UPDATE(tdvp, NULL, NULL, + UPDATE_DIROP)) != 0) { + tdp->i_nlink--; + DIP_ASSIGN(tdp, nlink, tdp->i_nlink); + tdp->i_flag |= IN_CHANGE; + goto bad; + } + } + newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK); + ufs_makedirentry(ip, tcnp, newdir); + error = ufs_direnter(tdvp, &to_ulr, + NULL, newdir, tcnp, NULL); + pool_cache_put(ufs_direct_cache, newdir); + if (error != 0) { + if (doingdirectory && newparent) { + tdp->i_nlink--; + DIP_ASSIGN(tdp, nlink, tdp->i_nlink); + tdp->i_flag |= IN_CHANGE; + (void)UFS_UPDATE(tdvp, NULL, NULL, + UPDATE_WAIT | UPDATE_DIROP); + } + goto bad; + } + VN_KNOTE(tdvp, NOTE_WRITE); + } else { + if (txp->i_dev != tdp->i_dev || txp->i_dev != ip->i_dev) + panic("rename: EXDEV"); + /* + * Short circuit rename(foo, foo). + */ + if (txp->i_number == ip->i_number) + panic("rename: same file"); + /* + * If the parent directory is "sticky", then the user must + * own the parent directory, or the destination of the rename, + * otherwise the destination may not be changed (except by + * root). This implements append-only directories. + */ + if ((tdp->i_mode & S_ISTXT) && + kauth_authorize_generic(tcnp->cn_cred, + KAUTH_GENERIC_ISSUSER, NULL) != 0 && + kauth_cred_geteuid(tcnp->cn_cred) != tdp->i_uid && + txp->i_uid != kauth_cred_geteuid(tcnp->cn_cred)) { + error = EPERM; + goto bad; + } + /* + * Target must be empty if a directory and have no links + * to it. Also, ensure source and target are compatible + * (both directories, or both not directories). + */ + if ((txp->i_mode & IFMT) == IFDIR) { + if (txp->i_nlink > 2 || + !ufs_dirempty(txp, tdp->i_number, tcnp->cn_cred)) { + error = ENOTEMPTY; + goto bad; + } + if (!doingdirectory) { + error = ENOTDIR; + goto bad; + } + cache_purge(tdvp); + } else if (doingdirectory) { + error = EISDIR; + goto bad; + } + if ((error = ufs_dirrewrite(tdp, to_ulr.ulr_offset, + txp, ip->i_number, + IFTODT(ip->i_mode), doingdirectory && newparent ? + newparent : doingdirectory, IN_CHANGE | IN_UPDATE)) != 0) + goto bad; + if (doingdirectory) { + /* + * Truncate inode. The only stuff left in the directory + * is "." and "..". The "." reference is inconsequential + * since we are quashing it. We have removed the "." + * reference and the reference in the parent directory, + * but there may be other hard links. + */ + if (!newparent) { + tdp->i_nlink--; + DIP_ASSIGN(tdp, nlink, tdp->i_nlink); + tdp->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(tdvp, NULL, NULL, 0); + } + txp->i_nlink--; + DIP_ASSIGN(txp, nlink, txp->i_nlink); + txp->i_flag |= IN_CHANGE; + if ((error = UFS_TRUNCATE(tvp, (off_t)0, IO_SYNC, + tcnp->cn_cred))) + goto bad; + } + VN_KNOTE(tdvp, NOTE_WRITE); + VN_KNOTE(tvp, NOTE_DELETE); + } + + /* + * Handle case where the directory entry we need to remove, + * which is/was at from_ulr.ulr_offset, or the one before it, + * which is/was at from_ulr.ulr_offset - from_ulr.ulr_count, + * may have been moved when the directory insertion above + * performed compaction. 
+ */ + if (tdp->i_number == fdp->i_number && + ulr_overlap(&from_ulr, &to_ulr)) { + + struct buf *bp; + struct direct *ep; + struct ufsmount *ump = fdp->i_ump; + doff_t curpos; + doff_t endsearch; /* offset to end directory search */ + uint32_t prev_reclen; + int dirblksiz = ump->um_dirblksiz; + const int needswap = UFS_MPNEEDSWAP(ump); + u_long bmask; + int namlen, entryoffsetinblock; + char *dirbuf; + + bmask = fdvp->v_mount->mnt_stat.f_iosize - 1; + + /* + * The fcnp entry will be somewhere between the start of + * compaction (to_ulr.ulr_offset) and the original location + * (from_ulr.ulr_offset). + */ + curpos = to_ulr.ulr_offset; + endsearch = from_ulr.ulr_offset + from_ulr.ulr_reclen; + entryoffsetinblock = 0; + + /* + * Get the directory block containing the start of + * compaction. + */ + error = ufs_blkatoff(fdvp, (off_t)to_ulr.ulr_offset, &dirbuf, + &bp, false); + if (error) + goto bad; + + /* + * Keep existing ulr_count (length of previous record) + * for the case where compaction did not include the + * previous entry but started at the from-entry. + */ + prev_reclen = from_ulr.ulr_count; + + while (curpos < endsearch) { + uint32_t reclen; + + /* + * If necessary, get the next directory block. + * + * dholland 7/13/11 to the best of my understanding + * this should never happen; compaction occurs only + * within single blocks. I think. + */ + if ((curpos & bmask) == 0) { + if (bp != NULL) + brelse(bp, 0); + error = ufs_blkatoff(fdvp, (off_t)curpos, + &dirbuf, &bp, false); + if (error) + goto bad; + entryoffsetinblock = 0; + } + + KASSERT(bp != NULL); + ep = (struct direct *)(dirbuf + entryoffsetinblock); + reclen = ufs_rw16(ep->d_reclen, needswap); + +#if (BYTE_ORDER == LITTLE_ENDIAN) + if (FSFMT(fdvp) && needswap == 0) + namlen = ep->d_type; + else + namlen = ep->d_namlen; +#else + if (FSFMT(fdvp) && needswap != 0) + namlen = ep->d_type; + else + namlen = ep->d_namlen; +#endif + if ((ep->d_ino != 0) && + (ufs_rw32(ep->d_ino, needswap) != WINO) && + (namlen == fcnp->cn_namelen) && + memcmp(ep->d_name, fcnp->cn_nameptr, namlen) == 0) { + from_ulr.ulr_reclen = reclen; + break; + } + curpos += reclen; + entryoffsetinblock += reclen; + prev_reclen = reclen; + } + + from_ulr.ulr_offset = curpos; + from_ulr.ulr_count = prev_reclen; + + KASSERT(curpos <= endsearch); + + /* + * If ulr_offset points to start of a directory block, + * clear ulr_count so ufs_dirremove() doesn't try to + * merge free space over a directory block boundary. + */ + if ((from_ulr.ulr_offset & (dirblksiz - 1)) == 0) + from_ulr.ulr_count = 0; + + brelse(bp, 0); + } + + /* + * 3) Unlink the source. + */ + +#if 0 + /* + * Ensure that the directory entry still exists and has not + * changed while the new name has been entered. If the source is + * a file then the entry may have been unlinked or renamed. In + * either case there is no further work to be done. If the source + * is a directory then it cannot have been rmdir'ed; The IRENAME + * flag ensures that it cannot be moved by another rename or removed + * by a rmdir. + */ +#endif + KASSERT(fxp == ip); + + /* + * If the source is a directory with a new parent, the link + * count of the old parent directory must be decremented and + * ".." set to point to the new parent. 
+ */ + if (doingdirectory && newparent) { + KASSERT(fdp != NULL); + ufs_dirrewrite(fxp, mastertemplate.dot_reclen, + fdp, newparent, DT_DIR, 0, IN_CHANGE); + cache_purge(fdvp); + } + error = ufs_dirremove(fdvp, &from_ulr, + fxp, fcnp->cn_flags, 0); + fxp->i_flag &= ~IN_RENAME; + + VN_KNOTE(fvp, NOTE_RENAME); + goto done; + + out: + goto out2; + + /* exit routines from steps 1 & 2 */ + bad: + if (doingdirectory) + ip->i_flag &= ~IN_RENAME; + ip->i_nlink--; + DIP_ASSIGN(ip, nlink, ip->i_nlink); + ip->i_flag |= IN_CHANGE; + ip->i_flag &= ~IN_RENAME; + UFS_WAPBL_UPDATE(fvp, NULL, NULL, 0); + done: + UFS_WAPBL_END(fdvp->v_mount); + out2: + /* + * clear IN_RENAME - some exit paths happen too early to go + * through the cleanup done in the "bad" case above, so we + * always do this mini-cleanup here. + */ + ip->i_flag &= ~IN_RENAME; + + VOP_UNLOCK(fdvp); + if (tdvp != fdvp) { + VOP_UNLOCK(tdvp); + } + VOP_UNLOCK(fvp); + if (tvp && tvp != fvp) { + VOP_UNLOCK(tvp); + } + + vrele(fdvp); + vrele(tdvp); + vrele(fvp); + if (tvp) { + vrele(tvp); + } + + fstrans_done(mp); + return (error); + + abort_withlocks: + VOP_UNLOCK(fdvp); + if (tdvp != fdvp) { + VOP_UNLOCK(tdvp); + } + VOP_UNLOCK(fvp); + if (tvp && tvp != fvp) { + VOP_UNLOCK(tvp); + } + + abort: + VOP_ABORTOP(fdvp, fcnp); /* XXX, why not in NFS? */ + VOP_ABORTOP(tdvp, tcnp); /* XXX, why not in NFS? */ + vrele(tdvp); + if (tvp) { + vrele(tvp); + } + vrele(fdvp); + if (fvp) { + vrele(fvp); + } + return (error); +} + +int +ufs_mkdir(void *v) +{ + struct vop_mkdir_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap = v; + struct vnode *dvp = ap->a_dvp, *tvp; + struct vattr *vap = ap->a_vap; + struct componentname *cnp = ap->a_cnp; + struct inode *ip, *dp = VTOI(dvp); + struct buf *bp; + struct dirtemplate dirtemplate; + struct direct *newdir; + int error, dmode; + struct ufsmount *ump = dp->i_ump; + int dirblksiz = ump->um_dirblksiz; + struct ufs_lookup_results *ulr; + + fstrans_start(dvp->v_mount, FSTRANS_SHARED); + + /* XXX should handle this material another way */ + ulr = &dp->i_crap; + UFS_CHECK_CRAPCOUNTER(dp); + + if ((nlink_t)dp->i_nlink >= LINK_MAX) { + error = EMLINK; + goto out; + } + dmode = vap->va_mode & ACCESSPERMS; + dmode |= IFDIR; + /* + * Must simulate part of ufs_makeinode here to acquire the inode, + * but not have it entered in the parent directory. The entry is + * made later after writing "." and ".." entries. + */ + if ((error = UFS_VALLOC(dvp, dmode, cnp->cn_cred, ap->a_vpp)) != 0) + goto out; + + tvp = *ap->a_vpp; + ip = VTOI(tvp); + + error = UFS_WAPBL_BEGIN(ap->a_dvp->v_mount); + if (error) { + UFS_VFREE(tvp, ip->i_number, dmode); + vput(tvp); + goto out; + } + ip->i_uid = kauth_cred_geteuid(cnp->cn_cred); + DIP_ASSIGN(ip, uid, ip->i_uid); + ip->i_gid = dp->i_gid; + DIP_ASSIGN(ip, gid, ip->i_gid); +#if defined(QUOTA) || defined(QUOTA2) + if ((error = chkiq(ip, 1, cnp->cn_cred, 0))) { + UFS_VFREE(tvp, ip->i_number, dmode); + UFS_WAPBL_END(dvp->v_mount); + fstrans_done(dvp->v_mount); + vput(tvp); + vput(dvp); + return (error); + } +#endif + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + ip->i_mode = dmode; + DIP_ASSIGN(ip, mode, dmode); + tvp->v_type = VDIR; /* Rest init'd in getnewvnode(). */ + ip->i_nlink = 2; + DIP_ASSIGN(ip, nlink, 2); + if (cnp->cn_flags & ISWHITEOUT) { + ip->i_flags |= UF_OPAQUE; + DIP_ASSIGN(ip, flags, ip->i_flags); + } + + /* + * Bump link count in parent directory to reflect work done below. 
+ * Should be done before reference is created so cleanup is + * possible if we crash. + */ + dp->i_nlink++; + DIP_ASSIGN(dp, nlink, dp->i_nlink); + dp->i_flag |= IN_CHANGE; + if ((error = UFS_UPDATE(dvp, NULL, NULL, UPDATE_DIROP)) != 0) + goto bad; + + /* + * Initialize directory with "." and ".." from static template. + */ + dirtemplate = mastertemplate; + dirtemplate.dotdot_reclen = dirblksiz - dirtemplate.dot_reclen; + dirtemplate.dot_ino = ufs_rw32(ip->i_number, UFS_MPNEEDSWAP(ump)); + dirtemplate.dotdot_ino = ufs_rw32(dp->i_number, UFS_MPNEEDSWAP(ump)); + dirtemplate.dot_reclen = ufs_rw16(dirtemplate.dot_reclen, + UFS_MPNEEDSWAP(ump)); + dirtemplate.dotdot_reclen = ufs_rw16(dirtemplate.dotdot_reclen, + UFS_MPNEEDSWAP(ump)); + if (ump->um_maxsymlinklen <= 0) { +#if BYTE_ORDER == LITTLE_ENDIAN + if (UFS_MPNEEDSWAP(ump) == 0) +#else + if (UFS_MPNEEDSWAP(ump) != 0) +#endif + { + dirtemplate.dot_type = dirtemplate.dot_namlen; + dirtemplate.dotdot_type = dirtemplate.dotdot_namlen; + dirtemplate.dot_namlen = dirtemplate.dotdot_namlen = 0; + } else + dirtemplate.dot_type = dirtemplate.dotdot_type = 0; + } + if ((error = UFS_BALLOC(tvp, (off_t)0, dirblksiz, cnp->cn_cred, + B_CLRBUF, &bp)) != 0) + goto bad; + ip->i_size = dirblksiz; + DIP_ASSIGN(ip, size, dirblksiz); + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + uvm_vnp_setsize(tvp, ip->i_size); + memcpy((void *)bp->b_data, (void *)&dirtemplate, sizeof dirtemplate); + + /* + * Directory set up, now install it's entry in the parent directory. + * We must write out the buffer containing the new directory body + * before entering the new name in the parent. + */ + if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0) + goto bad; + if ((error = UFS_UPDATE(tvp, NULL, NULL, UPDATE_DIROP)) != 0) { + goto bad; + } + newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK); + ufs_makedirentry(ip, cnp, newdir); + error = ufs_direnter(dvp, ulr, tvp, newdir, cnp, bp); + pool_cache_put(ufs_direct_cache, newdir); + bad: + if (error == 0) { + VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK); + UFS_WAPBL_END(dvp->v_mount); + } else { + dp->i_nlink--; + DIP_ASSIGN(dp, nlink, dp->i_nlink); + dp->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP); + /* + * No need to do an explicit UFS_TRUNCATE here, vrele will + * do this for us because we set the link count to 0. + */ + ip->i_nlink = 0; + DIP_ASSIGN(ip, nlink, 0); + ip->i_flag |= IN_CHANGE; + /* If IN_ADIROP, account for it */ + UFS_UNMARK_VNODE(tvp); + UFS_WAPBL_UPDATE(tvp, NULL, NULL, UPDATE_DIROP); + UFS_WAPBL_END(dvp->v_mount); + vput(tvp); + } + out: + fstrans_done(dvp->v_mount); + vput(dvp); + return (error); +} + +int +ufs_rmdir(void *v) +{ + struct vop_rmdir_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap = v; + struct vnode *vp, *dvp; + struct componentname *cnp; + struct inode *ip, *dp; + int error; + struct ufs_lookup_results *ulr; + + vp = ap->a_vp; + dvp = ap->a_dvp; + cnp = ap->a_cnp; + ip = VTOI(vp); + dp = VTOI(dvp); + + /* XXX should handle this material another way */ + ulr = &dp->i_crap; + UFS_CHECK_CRAPCOUNTER(dp); + + /* + * No rmdir "." or of mounted directories please. + */ + if (dp == ip || vp->v_mountedhere != NULL) { + if (dp == ip) + vrele(dvp); + else + vput(dvp); + vput(vp); + return (EINVAL); + } + + fstrans_start(dvp->v_mount, FSTRANS_SHARED); + + /* + * Do not remove a directory that is in the process of being renamed. + * Verify that the directory is empty (and valid). (Rmdir ".." won't + * be valid since ".." 
will contain a reference to the current + * directory and thus be non-empty.) + */ + error = 0; + if (ip->i_flag & IN_RENAME) { + error = EINVAL; + goto out; + } + if (ip->i_nlink != 2 || + !ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) { + error = ENOTEMPTY; + goto out; + } + if ((dp->i_flags & APPEND) || + (ip->i_flags & (IMMUTABLE | APPEND))) { + error = EPERM; + goto out; + } + error = UFS_WAPBL_BEGIN(dvp->v_mount); + if (error) + goto out; + /* + * Delete reference to directory before purging + * inode. If we crash in between, the directory + * will be reattached to lost+found, + */ + error = ufs_dirremove(dvp, ulr, ip, cnp->cn_flags, 1); + if (error) { + UFS_WAPBL_END(dvp->v_mount); + goto out; + } + VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK); + cache_purge(dvp); + /* + * Truncate inode. The only stuff left in the directory is "." and + * "..". The "." reference is inconsequential since we're quashing + * it. + */ + dp->i_nlink--; + DIP_ASSIGN(dp, nlink, dp->i_nlink); + dp->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP); + ip->i_nlink--; + DIP_ASSIGN(ip, nlink, ip->i_nlink); + ip->i_flag |= IN_CHANGE; + error = UFS_TRUNCATE(vp, (off_t)0, IO_SYNC, cnp->cn_cred); + cache_purge(vp); + /* + * Unlock the log while we still have reference to unlinked + * directory vp so that it will not get locked for recycling + */ + UFS_WAPBL_END(dvp->v_mount); +#ifdef UFS_DIRHASH + if (ip->i_dirhash != NULL) + ufsdirhash_free(ip); +#endif + out: + VN_KNOTE(vp, NOTE_DELETE); + vput(vp); + fstrans_done(dvp->v_mount); + vput(dvp); + return (error); +} + +/* + * symlink -- make a symbolic link + */ +int +ufs_symlink(void *v) +{ + struct vop_symlink_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + char *a_target; + } */ *ap = v; + struct vnode *vp, **vpp; + struct inode *ip; + int len, error; + struct ufs_lookup_results *ulr; + + vpp = ap->a_vpp; + + /* XXX should handle this material another way */ + ulr = &VTOI(ap->a_dvp)->i_crap; + UFS_CHECK_CRAPCOUNTER(VTOI(ap->a_dvp)); + + /* + * UFS_WAPBL_BEGIN1(dvp->v_mount, dvp) performed by successful + * ufs_makeinode + */ + fstrans_start(ap->a_dvp->v_mount, FSTRANS_SHARED); + error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp, ulr, + vpp, ap->a_cnp); + if (error) + goto out; + VN_KNOTE(ap->a_dvp, NOTE_WRITE); + vp = *vpp; + len = strlen(ap->a_target); + ip = VTOI(vp); + if (len < ip->i_ump->um_maxsymlinklen) { + memcpy((char *)SHORTLINK(ip), ap->a_target, len); + ip->i_size = len; + DIP_ASSIGN(ip, size, len); + uvm_vnp_setsize(vp, ip->i_size); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (vp->v_mount->mnt_flag & MNT_RELATIME) + ip->i_flag |= IN_ACCESS; + UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); + } else + error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0, + UIO_SYSSPACE, IO_NODELOCKED | IO_JOURNALLOCKED, + ap->a_cnp->cn_cred, NULL, NULL); + UFS_WAPBL_END1(ap->a_dvp->v_mount, ap->a_dvp); + if (error) + vput(vp); +out: + fstrans_done(ap->a_dvp->v_mount); + return (error); +} + +/* + * Vnode op for reading directories. + * + * This routine handles converting from the on-disk directory format + * "struct direct" to the in-memory format "struct dirent" as well as + * byte swapping the entries if necessary. 
+ */ +int +ufs_readdir(void *v) +{ + struct vop_readdir_args /* { + struct vnode *a_vp; + struct uio *a_uio; + kauth_cred_t a_cred; + int *a_eofflag; + off_t **a_cookies; + int *ncookies; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct direct *cdp, *ecdp; + struct dirent *ndp; + char *cdbuf, *ndbuf, *endp; + struct uio auio, *uio; + struct iovec aiov; + int error; + size_t count, ccount, rcount; + off_t off, *ccp; + off_t startoff; + size_t skipbytes; + struct ufsmount *ump = VFSTOUFS(vp->v_mount); + int nswap = UFS_MPNEEDSWAP(ump); +#if BYTE_ORDER == LITTLE_ENDIAN + int needswap = ump->um_maxsymlinklen <= 0 && nswap == 0; +#else + int needswap = ump->um_maxsymlinklen <= 0 && nswap != 0; +#endif + uio = ap->a_uio; + count = uio->uio_resid; + rcount = count - ((uio->uio_offset + count) & (ump->um_dirblksiz - 1)); + + if (rcount < _DIRENT_MINSIZE(cdp) || count < _DIRENT_MINSIZE(ndp)) + return EINVAL; + + startoff = uio->uio_offset & ~(ump->um_dirblksiz - 1); + skipbytes = uio->uio_offset - startoff; + rcount += skipbytes; + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = startoff; + auio.uio_resid = rcount; + UIO_SETUP_SYSSPACE(&auio); + auio.uio_rw = UIO_READ; + cdbuf = malloc(rcount, M_TEMP, M_WAITOK); + aiov.iov_base = cdbuf; + aiov.iov_len = rcount; + error = VOP_READ(vp, &auio, 0, ap->a_cred); + if (error != 0) { + free(cdbuf, M_TEMP); + return error; + } + + rcount -= auio.uio_resid; + + cdp = (struct direct *)(void *)cdbuf; + ecdp = (struct direct *)(void *)&cdbuf[rcount]; + + ndbuf = malloc(count, M_TEMP, M_WAITOK); + ndp = (struct dirent *)(void *)ndbuf; + endp = &ndbuf[count]; + + off = uio->uio_offset; + if (ap->a_cookies) { + ccount = rcount / _DIRENT_RECLEN(cdp, 1); + ccp = *(ap->a_cookies) = malloc(ccount * sizeof(*ccp), + M_TEMP, M_WAITOK); + } else { + /* XXX: GCC */ + ccount = 0; + ccp = NULL; + } + + while (cdp < ecdp) { + cdp->d_reclen = ufs_rw16(cdp->d_reclen, nswap); + if (skipbytes > 0) { + if (cdp->d_reclen <= skipbytes) { + skipbytes -= cdp->d_reclen; + cdp = _DIRENT_NEXT(cdp); + continue; + } + /* + * invalid cookie. 
+ */ + error = EINVAL; + goto out; + } + if (cdp->d_reclen == 0) { + struct dirent *ondp = ndp; + ndp->d_reclen = _DIRENT_MINSIZE(ndp); + ndp = _DIRENT_NEXT(ndp); + ondp->d_reclen = 0; + cdp = ecdp; + break; + } + if (needswap) { + ndp->d_type = cdp->d_namlen; + ndp->d_namlen = cdp->d_type; + } else { + ndp->d_type = cdp->d_type; + ndp->d_namlen = cdp->d_namlen; + } + ndp->d_reclen = _DIRENT_RECLEN(ndp, ndp->d_namlen); + if ((char *)(void *)ndp + ndp->d_reclen + + _DIRENT_MINSIZE(ndp) > endp) + break; + ndp->d_fileno = ufs_rw32(cdp->d_ino, nswap); + (void)memcpy(ndp->d_name, cdp->d_name, ndp->d_namlen); + memset(&ndp->d_name[ndp->d_namlen], 0, + ndp->d_reclen - _DIRENT_NAMEOFF(ndp) - ndp->d_namlen); + off += cdp->d_reclen; + if (ap->a_cookies) { + KASSERT(ccp - *(ap->a_cookies) < ccount); + *(ccp++) = off; + } + ndp = _DIRENT_NEXT(ndp); + cdp = _DIRENT_NEXT(cdp); + } + + count = ((char *)(void *)ndp - ndbuf); + error = uiomove(ndbuf, count, uio); +out: + if (ap->a_cookies) { + if (error) { + free(*(ap->a_cookies), M_TEMP); + *(ap->a_cookies) = NULL; + *(ap->a_ncookies) = 0; + } else { + *ap->a_ncookies = ccp - *(ap->a_cookies); + } + } + uio->uio_offset = off; + free(ndbuf, M_TEMP); + free(cdbuf, M_TEMP); + *ap->a_eofflag = VTOI(vp)->i_size <= uio->uio_offset; + return error; +} + +/* + * Return target name of a symbolic link + */ +int +ufs_readlink(void *v) +{ + struct vop_readlink_args /* { + struct vnode *a_vp; + struct uio *a_uio; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct ufsmount *ump = VFSTOUFS(vp->v_mount); + int isize; + + isize = ip->i_size; + if (isize < ump->um_maxsymlinklen || + (ump->um_maxsymlinklen == 0 && DIP(ip, blocks) == 0)) { + uiomove((char *)SHORTLINK(ip), isize, ap->a_uio); + return (0); + } + return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred)); +} + +/* + * Calculate the logical to physical mapping if not done already, + * then call the device strategy routine. + */ +int +ufs_strategy(void *v) +{ + struct vop_strategy_args /* { + struct vnode *a_vp; + struct buf *a_bp; + } */ *ap = v; + struct buf *bp; + struct vnode *vp; + struct inode *ip; + struct mount *mp; + int error; + + bp = ap->a_bp; + vp = ap->a_vp; + ip = VTOI(vp); + if (vp->v_type == VBLK || vp->v_type == VCHR) + panic("ufs_strategy: spec"); + KASSERT(bp->b_bcount != 0); + if (bp->b_blkno == bp->b_lblkno) { + error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, + NULL); + if (error) { + bp->b_error = error; + biodone(bp); + return (error); + } + if (bp->b_blkno == -1) /* no valid data */ + clrbuf(bp); + } + if (bp->b_blkno < 0) { /* block is not on disk */ + biodone(bp); + return (0); + } + vp = ip->i_devvp; + + error = VOP_STRATEGY(vp, bp); + if (error) + return error; + + if (!BUF_ISREAD(bp)) + return 0; + + mp = wapbl_vptomp(vp); + if (mp == NULL || mp->mnt_wapbl_replay == NULL || + !WAPBL_REPLAY_ISOPEN(mp) || + !WAPBL_REPLAY_CAN_READ(mp, bp->b_blkno, bp->b_bcount)) + return 0; + + error = biowait(bp); + if (error) + return error; + + error = WAPBL_REPLAY_READ(mp, bp->b_data, bp->b_blkno, bp->b_bcount); + if (error) { + mutex_enter(&bufcache_lock); + SET(bp->b_cflags, BC_INVAL); + mutex_exit(&bufcache_lock); + } + return error; +} + +/* + * Print out the contents of an inode. 
+ */ +int +ufs_print(void *v) +{ + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap = v; + struct vnode *vp; + struct inode *ip; + + vp = ap->a_vp; + ip = VTOI(vp); + printf("tag VT_UFS, ino %llu, on dev %llu, %llu", + (unsigned long long)ip->i_number, + (unsigned long long)major(ip->i_dev), + (unsigned long long)minor(ip->i_dev)); + printf(" flags 0x%x, nlink %d\n", + ip->i_flag, ip->i_nlink); + printf("\tmode 0%o, owner %d, group %d, size %qd", + ip->i_mode, ip->i_uid, ip->i_gid, + (long long)ip->i_size); + if (vp->v_type == VFIFO) + VOCALL(fifo_vnodeop_p, VOFFSET(vop_print), v); + printf("\n"); + return (0); +} + +/* + * Read wrapper for special devices. + */ +int +ufsspec_read(void *v) +{ + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + kauth_cred_t a_cred; + } */ *ap = v; + + /* + * Set access flag. + */ + if ((ap->a_vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0) + VTOI(ap->a_vp)->i_flag |= IN_ACCESS; + return (VOCALL (spec_vnodeop_p, VOFFSET(vop_read), ap)); +} + +/* + * Write wrapper for special devices. + */ +int +ufsspec_write(void *v) +{ + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + kauth_cred_t a_cred; + } */ *ap = v; + + /* + * Set update and change flags. + */ + if ((ap->a_vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0) + VTOI(ap->a_vp)->i_flag |= IN_MODIFY; + return (VOCALL (spec_vnodeop_p, VOFFSET(vop_write), ap)); +} + +/* + * Close wrapper for special devices. + * + * Update the times on the inode then do device close. + */ +int +ufsspec_close(void *v) +{ + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp; + struct inode *ip; + + vp = ap->a_vp; + ip = VTOI(vp); + if (vp->v_usecount > 1) + UFS_ITIMES(vp, NULL, NULL, NULL); + return (VOCALL (spec_vnodeop_p, VOFFSET(vop_close), ap)); +} + +/* + * Read wrapper for fifo's + */ +int +ufsfifo_read(void *v) +{ + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + kauth_cred_t a_cred; + } */ *ap = v; + + /* + * Set access flag. + */ + VTOI(ap->a_vp)->i_flag |= IN_ACCESS; + return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_read), ap)); +} + +/* + * Write wrapper for fifo's. + */ +int +ufsfifo_write(void *v) +{ + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + kauth_cred_t a_cred; + } */ *ap = v; + + /* + * Set update and change flags. + */ + VTOI(ap->a_vp)->i_flag |= IN_MODIFY; + return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_write), ap)); +} + +/* + * Close wrapper for fifo's. + * + * Update the times on the inode then do device close. + */ +int +ufsfifo_close(void *v) +{ + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp; + struct inode *ip; + + vp = ap->a_vp; + ip = VTOI(vp); + if (ap->a_vp->v_usecount > 1) + UFS_ITIMES(vp, NULL, NULL, NULL); + return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_close), ap)); +} + +/* + * Return POSIX pathconf information applicable to ufs filesystems. 
+ */ +int +ufs_pathconf(void *v) +{ + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + register_t *a_retval; + } */ *ap = v; + + switch (ap->a_name) { + case _PC_LINK_MAX: + *ap->a_retval = LINK_MAX; + return (0); + case _PC_NAME_MAX: + *ap->a_retval = FFS_MAXNAMLEN; + return (0); + case _PC_PATH_MAX: + *ap->a_retval = PATH_MAX; + return (0); + case _PC_PIPE_BUF: + *ap->a_retval = PIPE_BUF; + return (0); + case _PC_CHOWN_RESTRICTED: + *ap->a_retval = 1; + return (0); + case _PC_NO_TRUNC: + *ap->a_retval = 1; + return (0); + case _PC_SYNC_IO: + *ap->a_retval = 1; + return (0); + case _PC_FILESIZEBITS: + *ap->a_retval = 42; + return (0); + case _PC_SYMLINK_MAX: + *ap->a_retval = MAXPATHLEN; + return (0); + case _PC_2_SYMLINKS: + *ap->a_retval = 1; + return (0); + default: + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Advisory record locking support + */ +int +ufs_advlock(void *v) +{ + struct vop_advlock_args /* { + struct vnode *a_vp; + void * a_id; + int a_op; + struct flock *a_fl; + int a_flags; + } */ *ap = v; + struct inode *ip; + + ip = VTOI(ap->a_vp); + return lf_advlock(ap, &ip->i_lockf, ip->i_size); +} + +/* + * Initialize the vnode associated with a new inode, handle aliased + * vnodes. + */ +void +ufs_vinit(struct mount *mntp, int (**specops)(void *), int (**fifoops)(void *), + struct vnode **vpp) +{ + struct timeval tv; + struct inode *ip; + struct vnode *vp; + dev_t rdev; + struct ufsmount *ump; + + vp = *vpp; + ip = VTOI(vp); + switch(vp->v_type = IFTOVT(ip->i_mode)) { + case VCHR: + case VBLK: + vp->v_op = specops; + ump = ip->i_ump; + if (ump->um_fstype == UFS1) + rdev = (dev_t)ufs_rw32(ip->i_ffs1_rdev, + UFS_MPNEEDSWAP(ump)); + else + rdev = (dev_t)ufs_rw64(ip->i_ffs2_rdev, + UFS_MPNEEDSWAP(ump)); + spec_node_init(vp, rdev); + break; + case VFIFO: + vp->v_op = fifoops; + break; + case VNON: + case VBAD: + case VSOCK: + case VLNK: + case VDIR: + case VREG: + break; + } + if (ip->i_number == ROOTINO) + vp->v_vflag |= VV_ROOT; + /* + * Initialize modrev times + */ + getmicrouptime(&tv); + ip->i_modrev = (uint64_t)(uint)tv.tv_sec << 32 + | tv.tv_usec * 4294u; + *vpp = vp; +} + +/* + * Allocate a new inode. + */ +int +ufs_makeinode(int mode, struct vnode *dvp, const struct ufs_lookup_results *ulr, + struct vnode **vpp, struct componentname *cnp) +{ + struct inode *ip, *pdir; + struct direct *newdir; + struct vnode *tvp; + int error, ismember = 0; + + UFS_WAPBL_JUNLOCK_ASSERT(dvp->v_mount); + + pdir = VTOI(dvp); + + if ((mode & IFMT) == 0) + mode |= IFREG; + + if ((error = UFS_VALLOC(dvp, mode, cnp->cn_cred, vpp)) != 0) { + vput(dvp); + return (error); + } + tvp = *vpp; + ip = VTOI(tvp); + ip->i_gid = pdir->i_gid; + DIP_ASSIGN(ip, gid, ip->i_gid); + ip->i_uid = kauth_cred_geteuid(cnp->cn_cred); + DIP_ASSIGN(ip, uid, ip->i_uid); + error = UFS_WAPBL_BEGIN1(dvp->v_mount, dvp); + if (error) { + /* + * Note, we can't VOP_VFREE(tvp) here like we should + * because we can't write to the disk. Instead, we leave + * the vnode dangling from the journal. + */ + vput(tvp); + vput(dvp); + return (error); + } +#if defined(QUOTA) || defined(QUOTA2) + if ((error = chkiq(ip, 1, cnp->cn_cred, 0))) { + UFS_VFREE(tvp, ip->i_number, mode); + UFS_WAPBL_END1(dvp->v_mount, dvp); + vput(tvp); + vput(dvp); + return (error); + } +#endif + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + ip->i_mode = mode; + DIP_ASSIGN(ip, mode, mode); + tvp->v_type = IFTOVT(mode); /* Rest init'd in getnewvnode(). 
*/ + ip->i_nlink = 1; + DIP_ASSIGN(ip, nlink, 1); + if ((ip->i_mode & ISGID) && (kauth_cred_ismember_gid(cnp->cn_cred, + ip->i_gid, &ismember) != 0 || !ismember) && + kauth_authorize_generic(cnp->cn_cred, KAUTH_GENERIC_ISSUSER, NULL)) { + ip->i_mode &= ~ISGID; + DIP_ASSIGN(ip, mode, ip->i_mode); + } + + if (cnp->cn_flags & ISWHITEOUT) { + ip->i_flags |= UF_OPAQUE; + DIP_ASSIGN(ip, flags, ip->i_flags); + } + + /* + * Make sure inode goes to disk before directory entry. + */ + if ((error = UFS_UPDATE(tvp, NULL, NULL, UPDATE_DIROP)) != 0) + goto bad; + newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK); + ufs_makedirentry(ip, cnp, newdir); + error = ufs_direnter(dvp, ulr, tvp, newdir, cnp, NULL); + pool_cache_put(ufs_direct_cache, newdir); + if (error) + goto bad; + vput(dvp); + *vpp = tvp; + return (0); + + bad: + /* + * Write error occurred trying to update the inode + * or the directory so must deallocate the inode. + */ + ip->i_nlink = 0; + DIP_ASSIGN(ip, nlink, 0); + ip->i_flag |= IN_CHANGE; + /* If IN_ADIROP, account for it */ + UFS_UNMARK_VNODE(tvp); + UFS_WAPBL_UPDATE(tvp, NULL, NULL, 0); + tvp->v_type = VNON; /* explodes later if VBLK */ + UFS_WAPBL_END1(dvp->v_mount, dvp); + vput(tvp); + vput(dvp); + return (error); +} + +/* + * Allocate len bytes at offset off. + */ +int +ufs_gop_alloc(struct vnode *vp, off_t off, off_t len, int flags, + kauth_cred_t cred) +{ + struct inode *ip = VTOI(vp); + int error, delta, bshift, bsize; + UVMHIST_FUNC("ufs_gop_alloc"); UVMHIST_CALLED(ubchist); + + error = 0; + bshift = vp->v_mount->mnt_fs_bshift; + bsize = 1 << bshift; + + delta = off & (bsize - 1); + off -= delta; + len += delta; + + while (len > 0) { + bsize = MIN(bsize, len); + + error = UFS_BALLOC(vp, off, bsize, cred, flags, NULL); + if (error) { + goto out; + } + + /* + * increase file size now, UFS_BALLOC() requires that + * EOF be up-to-date before each call. + */ + + if (ip->i_size < off + bsize) { + UVMHIST_LOG(ubchist, "vp %p old 0x%x new 0x%x", + vp, ip->i_size, off + bsize, 0); + ip->i_size = off + bsize; + DIP_ASSIGN(ip, size, ip->i_size); + } + + off += bsize; + len -= bsize; + } + +out: + UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); + return error; +} + +void +ufs_gop_markupdate(struct vnode *vp, int flags) +{ + u_int32_t mask = 0; + + if ((flags & GOP_UPDATE_ACCESSED) != 0) { + mask = IN_ACCESS; + } + if ((flags & GOP_UPDATE_MODIFIED) != 0) { + if (vp->v_type == VREG) { + mask |= IN_CHANGE | IN_UPDATE; + } else { + mask |= IN_MODIFY; + } + } + if (mask) { + struct inode *ip = VTOI(vp); + + ip->i_flag |= mask; + } +} diff --git a/sys/ufs/ufs/ufs_wapbl.c b/sys/ufs/ufs/ufs_wapbl.c new file mode 100644 index 000000000..1f11526e7 --- /dev/null +++ b/sys/ufs/ufs/ufs_wapbl.c @@ -0,0 +1,166 @@ +/* $NetBSD: ufs_wapbl.c,v 1.22 2011/07/18 06:46:05 dholland Exp $ */ + +/*- + * Copyright (c) 2003,2006,2008 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 1982, 1986, 1989, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ufs_vnops.c 8.28 (Berkeley) 7/31/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ufs_wapbl.c,v 1.22 2011/07/18 06:46:05 dholland Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef WAPBL_DEBUG_INODES +#error WAPBL_DEBUG_INODES: not functional before ufs_wapbl.c is updated +void +ufs_wapbl_verify_inodes(struct mount *mp, const char *str) +{ + struct vnode *vp, *nvp; + struct inode *ip; + struct buf *bp, *nbp; + + mutex_enter(&mntvnode_lock); + loop: + TAILQ_FOREACH_REVERSE(vp, &mp->mnt_vnodelist, vnodelst, v_mntvnodes) { + /* + * If the vnode that we are about to sync is no longer + * associated with this mount point, start over. + */ + if (vp->v_mount != mp) + goto loop; + mutex_enter(&vp->v_interlock); + nvp = TAILQ_NEXT(vp, v_mntvnodes); + ip = VTOI(vp); + if (vp->v_type == VNON) { + mutex_exit(&vp->v_interlock); + continue; + } + /* verify that update has been called on all inodes */ + if (ip->i_flag & (IN_CHANGE | IN_UPDATE)) { + panic("wapbl_verify: mp %p: dirty vnode %p (inode %p): 0x%x\n", + mp, vp, ip, ip->i_flag); + } + mutex_exit(&mntvnode_lock); + + mutex_enter(&bufcache_lock); + for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = LIST_NEXT(bp, b_vnbufs); + if ((bp->b_cflags & BC_BUSY)) { + continue; + } + KASSERT((bp->b_oflags & BO_DELWRI) != 0); + KASSERT((bp->b_flags & B_LOCKED) != 0); + } + mutex_exit(&bufcache_lock); + mutex_exit(&vp->v_interlock); + + mutex_enter(&mntvnode_lock); + } + mutex_exit(&mntvnode_lock); + + vp = VFSTOUFS(mp)->um_devvp; + mutex_enter(&vp->v_interlock); + mutex_enter(&bufcache_lock); + for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = LIST_NEXT(bp, b_vnbufs); + if ((bp->b_cflags & BC_BUSY)) { + continue; + } + KASSERT((bp->b_oflags & BO_DELWRI) != 0); + KASSERT((bp->b_flags & B_LOCKED) != 0); + } + mutex_exit(&bufcache_lock); + mutex_exit(&vp->v_interlock); +} +#endif /* WAPBL_DEBUG_INODES */ diff --git a/include/ufs/ufs/ufs_wapbl.h b/sys/ufs/ufs/ufs_wapbl.h similarity index 100% rename from include/ufs/ufs/ufs_wapbl.h rename to sys/ufs/ufs/ufs_wapbl.h diff --git a/include/ufs/ufs/ufsmount.h b/sys/ufs/ufs/ufsmount.h similarity index 100% rename from include/ufs/ufs/ufsmount.h rename to sys/ufs/ufs/ufsmount.h diff --git a/tools/nbsd_ports b/tools/nbsd_ports index ea9ee9469..bf059ec0b 100644 --- a/tools/nbsd_ports +++ b/tools/nbsd_ports @@ -2,16 +2,16 @@ # Timestamp in UTC,minixpath,netbsdpath # minixpath: path in Minix source tree (starting from /usr/src/) # netbsdpath: path in BSD source tree (starting from src/) +2011/12/25 06:09:09,sys/arch/i386/stand 2012/02/10 16:16:12,share/zoneinfo 2011/05/26 00:00:00,external/public-domain/xz 2011/09/30 01:32:21,usr.bin/gzip 2011/08/27 12:55:09,bin/date 2011/10/17 09:24:54,common/lib/libprop -2011/11/28 12:50:07,include/ufs,sys/ufs +2011/11/28 12:50:07,sys/ufs 2010/09/10 15:51:20,sbin/newfs_ext2fs 2011/09/16 16:13:18,sbin/fsck_ext2fs 2011/09/30 22:08:19,lib/libprop -2011/08/30 12:39:55,common/include/arch/i386,sys/arch/i386/include 2011/11/13 22:19:09,common/include 2011/01/17 18:11:10,common/lib/libc 2011/01/21 23:36:49,lib/libc @@ -40,7 +40,7 @@ 2011/09/01 13:37:33,usr.bin/du 2010/07/07 21:24:34,usr.bin/man 2009/05/08 12:48:43,usr.bin/apropos -2011/01/12 23:02:22,usr.bin/mdocml,external/bsd/mdocml 
+2011/01/12 23:02:22,external/bsd/mdocml 2011/11/03 20:46:41,usr.sbin/installboot 2011/01/04 10:01:51,usr.sbin/pwd_mkdb 2011/01/04 10:30:21,usr.sbin/user @@ -50,10 +50,5 @@ 2007/05/28 12:06:25,usr.bin/bzip2recover 2009/04/02 21:39:33,libexec/makewhatis 2010/05/14 16:43:34,dist/bzip2 -2011/08/17 00:07:38,sys/arch/i386/stand/bootxx -2011/12/25 06:09:09,sys/arch/i386/stand/boot -2011/05/20 22:29:55,sys/arch/i386/stand/cdboot -2011/09/21 18:15:59,sys/arch/i386/stand/mbr -2011/11/28 07:56:54,sys/arch/i386/stand/lib 2012/01/16 18:47:57,sys/lib/libsa 2011/10/30 00:28:57,sys/lib/libz diff --git a/usr.bin/Makefile b/usr.bin/Makefile index 9674cd9ac..6528d9318 100644 --- a/usr.bin/Makefile +++ b/usr.bin/Makefile @@ -3,7 +3,7 @@ .include # NetBSD imports -SUBDIR= indent m4 stat tic sed mkdep uniq seq du man mdocml \ +SUBDIR= indent m4 stat tic sed mkdep uniq seq du man \ apropos chpass newgrp passwd bzip2 bzip2recover gzip # Non-NetBSD imports
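One further aside, on ufs_gop_alloc() in the ufs_vnops.c hunk above: the delta computation rounds the starting offset down to a file system block boundary and widens the length to compensate, so each UFS_BALLOC() call works on an aligned, at-most-block-sized chunk. A standalone sketch of just that arithmetic, assuming 8 KB blocks (bshift = 13) and an arbitrary request:

/* Illustrative only: the off/len rounding loop of ufs_gop_alloc(). */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	const int bshift = 13;			/* assume 8 KB blocks */
	const int64_t bsize = (int64_t)1 << bshift;

	int64_t off = 20000, len = 1000;	/* arbitrary request */

	int64_t delta = off & (bsize - 1);	/* distance into the block */
	off -= delta;				/* round the start down ...  */
	len += delta;				/* ... and widen the request */

	while (len > 0) {
		int64_t chunk = bsize < len ? bsize : len;
		/* ufs_gop_alloc() would call UFS_BALLOC(vp, off, chunk, ...) here */
		printf("allocate %lld bytes at offset %lld\n",
		    (long long)chunk, (long long)off);
		off += chunk;
		len -= chunk;
	}
	return 0;
}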