diff --git a/Makefile b/Makefile index e9e111758..fa49ec775 100644 --- a/Makefile +++ b/Makefile @@ -37,6 +37,7 @@ mkfiles: includes: $(MAKE) -C include includes $(MAKE) -C lib includes NBSD_LIBC=yes + $(MAKE) -C sys includes MKHEADERSS=/usr/pkg/gcc*/libexec/gcc/*/*/install-tools/mkheaders gnu-includes: includes @@ -50,6 +51,7 @@ commands: includes libraries $(MAKE) -C bin all $(MAKE) -C sbin all $(MAKE) -C usr.bin all + $(MAKE) -C external all $(MAKE) -C libexec all $(MAKE) -C usr.sbin all @@ -59,6 +61,7 @@ dep-all: $(MAKE) -C bin dependall $(MAKE) -C sbin dependall $(MAKE) -C usr.bin dependall + $(MAKE) -C external dependall $(MAKE) -C libexec dependall $(MAKE) -C usr.sbin dependall $(MAKE) -C kernel dependall @@ -77,6 +80,7 @@ all: $(MAKE) -C bin all $(MAKE) -C sbin all $(MAKE) -C usr.bin all + $(MAKE) -C external all $(MAKE) -C libexec all $(MAKE) -C usr.sbin all $(MAKE) -C tools all @@ -89,6 +93,7 @@ install: $(MAKE) -C bin install $(MAKE) -C sbin install $(MAKE) -C usr.bin install + $(MAKE) -C external install $(MAKE) -C usr.sbin install $(MAKE) -C servers install $(MAKE) -C share install @@ -100,6 +105,7 @@ clean: mkfiles $(MAKE) -C bin clean $(MAKE) -C sbin clean $(MAKE) -C usr.bin clean + $(MAKE) -C external clean $(MAKE) -C libexec clean $(MAKE) -C usr.sbin clean $(MAKE) -C share clean @@ -114,6 +120,7 @@ cleandepend: mkfiles $(MAKE) -C bin cleandepend $(MAKE) -C sbin cleandepend $(MAKE) -C usr.bin cleandepend + $(MAKE) -C external cleandepend $(MAKE) -C libexec cleandepend $(MAKE) -C usr.sbin cleandepend $(MAKE) -C tools cleandepend diff --git a/common/include/arch/i386/disklabel.h b/common/include/arch/i386/disklabel.h deleted file mode 100644 index e7d5246bc..000000000 --- a/common/include/arch/i386/disklabel.h +++ /dev/null @@ -1,75 +0,0 @@ -/* $NetBSD: disklabel.h,v 1.16 2011/08/30 12:39:55 bouyer Exp $ */ - -/* - * Copyright (c) 1994 Christopher G. Demetriou - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by Christopher G. Demetriou. - * 4. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef _I386_DISKLABEL_H_ -#define _I386_DISKLABEL_H_ - -#define LABELUSESMBR 1 /* use MBR partitionning */ -#define LABELSECTOR 1 /* sector containing label */ -#define LABELOFFSET 0 /* offset of label in sector */ -#define MAXPARTITIONS 16 /* number of partitions */ -#define OLDMAXPARTITIONS 8 /* number of partitions before 1.6 */ -#define RAW_PART 3 /* raw partition: XX?d (XXX) */ - -/* - * We use the highest bit of the minor number for the partition number. - * This maintains backward compatibility with device nodes created before - * MAXPARTITIONS was increased. - */ -#define __I386_MAXDISKS ((1 << 20) / MAXPARTITIONS) -#define DISKUNIT(dev) ((minor(dev) / OLDMAXPARTITIONS) % __I386_MAXDISKS) -#define DISKPART(dev) ((minor(dev) % OLDMAXPARTITIONS) + \ - ((minor(dev) / (__I386_MAXDISKS * OLDMAXPARTITIONS)) * OLDMAXPARTITIONS)) -#define DISKMINOR(unit, part) \ - (((unit) * OLDMAXPARTITIONS) + ((part) % OLDMAXPARTITIONS) + \ - ((part) / OLDMAXPARTITIONS) * (__I386_MAXDISKS * OLDMAXPARTITIONS)) - -/* Pull in MBR partition definitions. */ -#if HAVE_NBTOOL_CONFIG_H -#include -#else -#include -#endif /* HAVE_NBTOOL_CONFIG_H */ - -#ifndef __ASSEMBLER__ -#if HAVE_NBTOOL_CONFIG_H -#include -#else -#include -#endif /* HAVE_NBTOOL_CONFIG_H */ -struct cpu_disklabel { -#define __HAVE_DISKLABEL_DKBAD - struct dkbad bad; -}; -#endif - -#endif /* _I386_DISKLABEL_H_ */ diff --git a/external/Makefile b/external/Makefile new file mode 100644 index 000000000..4072adc24 --- /dev/null +++ b/external/Makefile @@ -0,0 +1,3 @@ +SUBDIR=bsd + +.include diff --git a/external/bsd/Makefile b/external/bsd/Makefile new file mode 100644 index 000000000..58740673e --- /dev/null +++ b/external/bsd/Makefile @@ -0,0 +1,3 @@ +.include +SUBDIR=mdocml +.include diff --git a/usr.bin/mdocml/Makefile b/external/bsd/mdocml/Makefile similarity index 100% rename from usr.bin/mdocml/Makefile rename to external/bsd/mdocml/Makefile diff --git a/usr.bin/mdocml/Makefile.inc b/external/bsd/mdocml/Makefile.inc similarity index 95% rename from usr.bin/mdocml/Makefile.inc rename to external/bsd/mdocml/Makefile.inc index 7368b890c..6dd59dd61 100644 --- a/usr.bin/mdocml/Makefile.inc +++ b/external/bsd/mdocml/Makefile.inc @@ -1,7 +1,6 @@ # $NetBSD: Makefile.inc,v 1.12 2010/07/25 19:16:18 joerg Exp $ .include -.include "../Makefile.inc" VERSION!= cd ${.PARSEDIR}/dist && ${MAKE} -V VERSION diff --git a/usr.bin/mdocml/bin/Makefile b/external/bsd/mdocml/bin/Makefile similarity index 100% rename from usr.bin/mdocml/bin/Makefile rename to external/bsd/mdocml/bin/Makefile diff --git a/usr.bin/mdocml/bin/Makefile.inc b/external/bsd/mdocml/bin/Makefile.inc similarity index 100% rename from usr.bin/mdocml/bin/Makefile.inc rename to external/bsd/mdocml/bin/Makefile.inc diff --git a/usr.bin/mdocml/bin/mandoc/Makefile b/external/bsd/mdocml/bin/mandoc/Makefile similarity index 100% rename from usr.bin/mdocml/bin/mandoc/Makefile rename to external/bsd/mdocml/bin/mandoc/Makefile diff --git a/usr.bin/mdocml/dist/Makefile b/external/bsd/mdocml/dist/Makefile similarity index 100% rename from usr.bin/mdocml/dist/Makefile rename to external/bsd/mdocml/dist/Makefile diff --git a/usr.bin/mdocml/dist/arch.c b/external/bsd/mdocml/dist/arch.c similarity index 100% rename from usr.bin/mdocml/dist/arch.c rename to external/bsd/mdocml/dist/arch.c diff --git a/usr.bin/mdocml/dist/arch.in b/external/bsd/mdocml/dist/arch.in similarity index 100% rename from usr.bin/mdocml/dist/arch.in rename to external/bsd/mdocml/dist/arch.in diff --git a/usr.bin/mdocml/dist/att.c 
b/external/bsd/mdocml/dist/att.c similarity index 100% rename from usr.bin/mdocml/dist/att.c rename to external/bsd/mdocml/dist/att.c diff --git a/usr.bin/mdocml/dist/att.in b/external/bsd/mdocml/dist/att.in similarity index 100% rename from usr.bin/mdocml/dist/att.in rename to external/bsd/mdocml/dist/att.in diff --git a/usr.bin/mdocml/dist/chars.c b/external/bsd/mdocml/dist/chars.c similarity index 100% rename from usr.bin/mdocml/dist/chars.c rename to external/bsd/mdocml/dist/chars.c diff --git a/usr.bin/mdocml/dist/chars.h b/external/bsd/mdocml/dist/chars.h similarity index 100% rename from usr.bin/mdocml/dist/chars.h rename to external/bsd/mdocml/dist/chars.h diff --git a/usr.bin/mdocml/dist/chars.in b/external/bsd/mdocml/dist/chars.in similarity index 100% rename from usr.bin/mdocml/dist/chars.in rename to external/bsd/mdocml/dist/chars.in diff --git a/usr.bin/mdocml/dist/compat.c b/external/bsd/mdocml/dist/compat.c similarity index 100% rename from usr.bin/mdocml/dist/compat.c rename to external/bsd/mdocml/dist/compat.c diff --git a/usr.bin/mdocml/dist/config.h.post b/external/bsd/mdocml/dist/config.h.post similarity index 100% rename from usr.bin/mdocml/dist/config.h.post rename to external/bsd/mdocml/dist/config.h.post diff --git a/usr.bin/mdocml/dist/config.h.pre b/external/bsd/mdocml/dist/config.h.pre similarity index 100% rename from usr.bin/mdocml/dist/config.h.pre rename to external/bsd/mdocml/dist/config.h.pre diff --git a/usr.bin/mdocml/dist/example.style.css b/external/bsd/mdocml/dist/example.style.css similarity index 100% rename from usr.bin/mdocml/dist/example.style.css rename to external/bsd/mdocml/dist/example.style.css diff --git a/usr.bin/mdocml/dist/external.png.uu b/external/bsd/mdocml/dist/external.png.uu similarity index 100% rename from usr.bin/mdocml/dist/external.png.uu rename to external/bsd/mdocml/dist/external.png.uu diff --git a/usr.bin/mdocml/dist/html.c b/external/bsd/mdocml/dist/html.c similarity index 100% rename from usr.bin/mdocml/dist/html.c rename to external/bsd/mdocml/dist/html.c diff --git a/usr.bin/mdocml/dist/html.h b/external/bsd/mdocml/dist/html.h similarity index 100% rename from usr.bin/mdocml/dist/html.h rename to external/bsd/mdocml/dist/html.h diff --git a/usr.bin/mdocml/dist/lib.c b/external/bsd/mdocml/dist/lib.c similarity index 100% rename from usr.bin/mdocml/dist/lib.c rename to external/bsd/mdocml/dist/lib.c diff --git a/usr.bin/mdocml/dist/lib.in b/external/bsd/mdocml/dist/lib.in similarity index 100% rename from usr.bin/mdocml/dist/lib.in rename to external/bsd/mdocml/dist/lib.in diff --git a/usr.bin/mdocml/dist/libman.h b/external/bsd/mdocml/dist/libman.h similarity index 100% rename from usr.bin/mdocml/dist/libman.h rename to external/bsd/mdocml/dist/libman.h diff --git a/usr.bin/mdocml/dist/libmandoc.h b/external/bsd/mdocml/dist/libmandoc.h similarity index 100% rename from usr.bin/mdocml/dist/libmandoc.h rename to external/bsd/mdocml/dist/libmandoc.h diff --git a/usr.bin/mdocml/dist/libmdoc.h b/external/bsd/mdocml/dist/libmdoc.h similarity index 100% rename from usr.bin/mdocml/dist/libmdoc.h rename to external/bsd/mdocml/dist/libmdoc.h diff --git a/usr.bin/mdocml/dist/libroff.h b/external/bsd/mdocml/dist/libroff.h similarity index 100% rename from usr.bin/mdocml/dist/libroff.h rename to external/bsd/mdocml/dist/libroff.h diff --git a/usr.bin/mdocml/dist/main.c b/external/bsd/mdocml/dist/main.c similarity index 100% rename from usr.bin/mdocml/dist/main.c rename to external/bsd/mdocml/dist/main.c diff --git 
a/usr.bin/mdocml/dist/main.h b/external/bsd/mdocml/dist/main.h similarity index 100% rename from usr.bin/mdocml/dist/main.h rename to external/bsd/mdocml/dist/main.h diff --git a/usr.bin/mdocml/dist/man.3 b/external/bsd/mdocml/dist/man.3 similarity index 100% rename from usr.bin/mdocml/dist/man.3 rename to external/bsd/mdocml/dist/man.3 diff --git a/usr.bin/mdocml/dist/man.7 b/external/bsd/mdocml/dist/man.7 similarity index 100% rename from usr.bin/mdocml/dist/man.7 rename to external/bsd/mdocml/dist/man.7 diff --git a/usr.bin/mdocml/dist/man.c b/external/bsd/mdocml/dist/man.c similarity index 100% rename from usr.bin/mdocml/dist/man.c rename to external/bsd/mdocml/dist/man.c diff --git a/usr.bin/mdocml/dist/man.h b/external/bsd/mdocml/dist/man.h similarity index 100% rename from usr.bin/mdocml/dist/man.h rename to external/bsd/mdocml/dist/man.h diff --git a/usr.bin/mdocml/dist/man_argv.c b/external/bsd/mdocml/dist/man_argv.c similarity index 100% rename from usr.bin/mdocml/dist/man_argv.c rename to external/bsd/mdocml/dist/man_argv.c diff --git a/usr.bin/mdocml/dist/man_hash.c b/external/bsd/mdocml/dist/man_hash.c similarity index 100% rename from usr.bin/mdocml/dist/man_hash.c rename to external/bsd/mdocml/dist/man_hash.c diff --git a/usr.bin/mdocml/dist/man_html.c b/external/bsd/mdocml/dist/man_html.c similarity index 100% rename from usr.bin/mdocml/dist/man_html.c rename to external/bsd/mdocml/dist/man_html.c diff --git a/usr.bin/mdocml/dist/man_macro.c b/external/bsd/mdocml/dist/man_macro.c similarity index 100% rename from usr.bin/mdocml/dist/man_macro.c rename to external/bsd/mdocml/dist/man_macro.c diff --git a/usr.bin/mdocml/dist/man_term.c b/external/bsd/mdocml/dist/man_term.c similarity index 100% rename from usr.bin/mdocml/dist/man_term.c rename to external/bsd/mdocml/dist/man_term.c diff --git a/usr.bin/mdocml/dist/man_validate.c b/external/bsd/mdocml/dist/man_validate.c similarity index 100% rename from usr.bin/mdocml/dist/man_validate.c rename to external/bsd/mdocml/dist/man_validate.c diff --git a/usr.bin/mdocml/dist/mandoc b/external/bsd/mdocml/dist/mandoc similarity index 100% rename from usr.bin/mdocml/dist/mandoc rename to external/bsd/mdocml/dist/mandoc diff --git a/usr.bin/mdocml/dist/mandoc.1 b/external/bsd/mdocml/dist/mandoc.1 similarity index 100% rename from usr.bin/mdocml/dist/mandoc.1 rename to external/bsd/mdocml/dist/mandoc.1 diff --git a/usr.bin/mdocml/dist/mandoc.c b/external/bsd/mdocml/dist/mandoc.c similarity index 100% rename from usr.bin/mdocml/dist/mandoc.c rename to external/bsd/mdocml/dist/mandoc.c diff --git a/usr.bin/mdocml/dist/mandoc.h b/external/bsd/mdocml/dist/mandoc.h similarity index 100% rename from usr.bin/mdocml/dist/mandoc.h rename to external/bsd/mdocml/dist/mandoc.h diff --git a/usr.bin/mdocml/dist/mandoc_char.7 b/external/bsd/mdocml/dist/mandoc_char.7 similarity index 100% rename from usr.bin/mdocml/dist/mandoc_char.7 rename to external/bsd/mdocml/dist/mandoc_char.7 diff --git a/usr.bin/mdocml/dist/mdoc.3 b/external/bsd/mdocml/dist/mdoc.3 similarity index 100% rename from usr.bin/mdocml/dist/mdoc.3 rename to external/bsd/mdocml/dist/mdoc.3 diff --git a/usr.bin/mdocml/dist/mdoc.7 b/external/bsd/mdocml/dist/mdoc.7 similarity index 100% rename from usr.bin/mdocml/dist/mdoc.7 rename to external/bsd/mdocml/dist/mdoc.7 diff --git a/usr.bin/mdocml/dist/mdoc.c b/external/bsd/mdocml/dist/mdoc.c similarity index 100% rename from usr.bin/mdocml/dist/mdoc.c rename to external/bsd/mdocml/dist/mdoc.c diff --git a/usr.bin/mdocml/dist/mdoc.h 
b/external/bsd/mdocml/dist/mdoc.h similarity index 100% rename from usr.bin/mdocml/dist/mdoc.h rename to external/bsd/mdocml/dist/mdoc.h diff --git a/usr.bin/mdocml/dist/mdoc_argv.c b/external/bsd/mdocml/dist/mdoc_argv.c similarity index 100% rename from usr.bin/mdocml/dist/mdoc_argv.c rename to external/bsd/mdocml/dist/mdoc_argv.c diff --git a/usr.bin/mdocml/dist/mdoc_hash.c b/external/bsd/mdocml/dist/mdoc_hash.c similarity index 100% rename from usr.bin/mdocml/dist/mdoc_hash.c rename to external/bsd/mdocml/dist/mdoc_hash.c diff --git a/usr.bin/mdocml/dist/mdoc_html.c b/external/bsd/mdocml/dist/mdoc_html.c similarity index 100% rename from usr.bin/mdocml/dist/mdoc_html.c rename to external/bsd/mdocml/dist/mdoc_html.c diff --git a/usr.bin/mdocml/dist/mdoc_macro.c b/external/bsd/mdocml/dist/mdoc_macro.c similarity index 100% rename from usr.bin/mdocml/dist/mdoc_macro.c rename to external/bsd/mdocml/dist/mdoc_macro.c diff --git a/usr.bin/mdocml/dist/mdoc_strings.c b/external/bsd/mdocml/dist/mdoc_strings.c similarity index 100% rename from usr.bin/mdocml/dist/mdoc_strings.c rename to external/bsd/mdocml/dist/mdoc_strings.c diff --git a/usr.bin/mdocml/dist/mdoc_term.c b/external/bsd/mdocml/dist/mdoc_term.c similarity index 100% rename from usr.bin/mdocml/dist/mdoc_term.c rename to external/bsd/mdocml/dist/mdoc_term.c diff --git a/usr.bin/mdocml/dist/mdoc_validate.c b/external/bsd/mdocml/dist/mdoc_validate.c similarity index 100% rename from usr.bin/mdocml/dist/mdoc_validate.c rename to external/bsd/mdocml/dist/mdoc_validate.c diff --git a/usr.bin/mdocml/dist/msec.c b/external/bsd/mdocml/dist/msec.c similarity index 100% rename from usr.bin/mdocml/dist/msec.c rename to external/bsd/mdocml/dist/msec.c diff --git a/usr.bin/mdocml/dist/msec.in b/external/bsd/mdocml/dist/msec.in similarity index 100% rename from usr.bin/mdocml/dist/msec.in rename to external/bsd/mdocml/dist/msec.in diff --git a/usr.bin/mdocml/dist/out.c b/external/bsd/mdocml/dist/out.c similarity index 100% rename from usr.bin/mdocml/dist/out.c rename to external/bsd/mdocml/dist/out.c diff --git a/usr.bin/mdocml/dist/out.h b/external/bsd/mdocml/dist/out.h similarity index 100% rename from usr.bin/mdocml/dist/out.h rename to external/bsd/mdocml/dist/out.h diff --git a/usr.bin/mdocml/dist/roff.3 b/external/bsd/mdocml/dist/roff.3 similarity index 100% rename from usr.bin/mdocml/dist/roff.3 rename to external/bsd/mdocml/dist/roff.3 diff --git a/usr.bin/mdocml/dist/roff.7 b/external/bsd/mdocml/dist/roff.7 similarity index 100% rename from usr.bin/mdocml/dist/roff.7 rename to external/bsd/mdocml/dist/roff.7 diff --git a/usr.bin/mdocml/dist/roff.c b/external/bsd/mdocml/dist/roff.c similarity index 100% rename from usr.bin/mdocml/dist/roff.c rename to external/bsd/mdocml/dist/roff.c diff --git a/usr.bin/mdocml/dist/roff.h b/external/bsd/mdocml/dist/roff.h similarity index 100% rename from usr.bin/mdocml/dist/roff.h rename to external/bsd/mdocml/dist/roff.h diff --git a/usr.bin/mdocml/dist/st.c b/external/bsd/mdocml/dist/st.c similarity index 100% rename from usr.bin/mdocml/dist/st.c rename to external/bsd/mdocml/dist/st.c diff --git a/usr.bin/mdocml/dist/st.in b/external/bsd/mdocml/dist/st.in similarity index 100% rename from usr.bin/mdocml/dist/st.in rename to external/bsd/mdocml/dist/st.in diff --git a/usr.bin/mdocml/dist/tbl.7 b/external/bsd/mdocml/dist/tbl.7 similarity index 100% rename from usr.bin/mdocml/dist/tbl.7 rename to external/bsd/mdocml/dist/tbl.7 diff --git a/usr.bin/mdocml/dist/tbl.c b/external/bsd/mdocml/dist/tbl.c 
similarity index 100% rename from usr.bin/mdocml/dist/tbl.c rename to external/bsd/mdocml/dist/tbl.c diff --git a/usr.bin/mdocml/dist/tbl_data.c b/external/bsd/mdocml/dist/tbl_data.c similarity index 100% rename from usr.bin/mdocml/dist/tbl_data.c rename to external/bsd/mdocml/dist/tbl_data.c diff --git a/usr.bin/mdocml/dist/tbl_html.c b/external/bsd/mdocml/dist/tbl_html.c similarity index 100% rename from usr.bin/mdocml/dist/tbl_html.c rename to external/bsd/mdocml/dist/tbl_html.c diff --git a/usr.bin/mdocml/dist/tbl_layout.c b/external/bsd/mdocml/dist/tbl_layout.c similarity index 100% rename from usr.bin/mdocml/dist/tbl_layout.c rename to external/bsd/mdocml/dist/tbl_layout.c diff --git a/usr.bin/mdocml/dist/tbl_opts.c b/external/bsd/mdocml/dist/tbl_opts.c similarity index 100% rename from usr.bin/mdocml/dist/tbl_opts.c rename to external/bsd/mdocml/dist/tbl_opts.c diff --git a/usr.bin/mdocml/dist/tbl_term.c b/external/bsd/mdocml/dist/tbl_term.c similarity index 100% rename from usr.bin/mdocml/dist/tbl_term.c rename to external/bsd/mdocml/dist/tbl_term.c diff --git a/usr.bin/mdocml/dist/term.c b/external/bsd/mdocml/dist/term.c similarity index 100% rename from usr.bin/mdocml/dist/term.c rename to external/bsd/mdocml/dist/term.c diff --git a/usr.bin/mdocml/dist/term.h b/external/bsd/mdocml/dist/term.h similarity index 100% rename from usr.bin/mdocml/dist/term.h rename to external/bsd/mdocml/dist/term.h diff --git a/usr.bin/mdocml/dist/term_ascii.c b/external/bsd/mdocml/dist/term_ascii.c similarity index 100% rename from usr.bin/mdocml/dist/term_ascii.c rename to external/bsd/mdocml/dist/term_ascii.c diff --git a/usr.bin/mdocml/dist/term_ps.c b/external/bsd/mdocml/dist/term_ps.c similarity index 100% rename from usr.bin/mdocml/dist/term_ps.c rename to external/bsd/mdocml/dist/term_ps.c diff --git a/usr.bin/mdocml/dist/test-strlcat.c b/external/bsd/mdocml/dist/test-strlcat.c similarity index 100% rename from usr.bin/mdocml/dist/test-strlcat.c rename to external/bsd/mdocml/dist/test-strlcat.c diff --git a/usr.bin/mdocml/dist/test-strlcpy.c b/external/bsd/mdocml/dist/test-strlcpy.c similarity index 100% rename from usr.bin/mdocml/dist/test-strlcpy.c rename to external/bsd/mdocml/dist/test-strlcpy.c diff --git a/usr.bin/mdocml/dist/tree.c b/external/bsd/mdocml/dist/tree.c similarity index 100% rename from usr.bin/mdocml/dist/tree.c rename to external/bsd/mdocml/dist/tree.c diff --git a/usr.bin/mdocml/dist/vol.c b/external/bsd/mdocml/dist/vol.c similarity index 100% rename from usr.bin/mdocml/dist/vol.c rename to external/bsd/mdocml/dist/vol.c diff --git a/usr.bin/mdocml/dist/vol.in b/external/bsd/mdocml/dist/vol.in similarity index 100% rename from usr.bin/mdocml/dist/vol.in rename to external/bsd/mdocml/dist/vol.in diff --git a/usr.bin/mdocml/lib/Makefile b/external/bsd/mdocml/lib/Makefile similarity index 100% rename from usr.bin/mdocml/lib/Makefile rename to external/bsd/mdocml/lib/Makefile diff --git a/usr.bin/mdocml/lib/Makefile.inc b/external/bsd/mdocml/lib/Makefile.inc similarity index 100% rename from usr.bin/mdocml/lib/Makefile.inc rename to external/bsd/mdocml/lib/Makefile.inc diff --git a/usr.bin/mdocml/lib/libman/Makefile b/external/bsd/mdocml/lib/libman/Makefile similarity index 100% rename from usr.bin/mdocml/lib/libman/Makefile rename to external/bsd/mdocml/lib/libman/Makefile diff --git a/usr.bin/mdocml/lib/libmdoc/Makefile b/external/bsd/mdocml/lib/libmdoc/Makefile similarity index 100% rename from usr.bin/mdocml/lib/libmdoc/Makefile rename to 
external/bsd/mdocml/lib/libmdoc/Makefile diff --git a/usr.bin/mdocml/lib/libroff/Makefile b/external/bsd/mdocml/lib/libroff/Makefile similarity index 100% rename from usr.bin/mdocml/lib/libroff/Makefile rename to external/bsd/mdocml/lib/libroff/Makefile diff --git a/usr.bin/mdocml/man/Makefile b/external/bsd/mdocml/man/Makefile similarity index 100% rename from usr.bin/mdocml/man/Makefile rename to external/bsd/mdocml/man/Makefile diff --git a/usr.bin/mdocml/prepare-import.sh b/external/bsd/mdocml/prepare-import.sh similarity index 100% rename from usr.bin/mdocml/prepare-import.sh rename to external/bsd/mdocml/prepare-import.sh diff --git a/include/Makefile b/include/Makefile index e95fdcda4..5a120d103 100644 --- a/include/Makefile +++ b/include/Makefile @@ -25,19 +25,6 @@ INCS= a.out.h aio.h ar.h assert.h atomic.h \ ttyent.h tzfile.h ucontext.h ulimit.h unistd.h util.h utime.h utmp.h \ utmpx.h uuid.h varargs.h vis.h wchar.h wctype.h wordexp.h -INCS += ufs/chfs/chfs.h ufs/chfs/chfs_args.h ufs/chfs/chfs_inode.h \ - ufs/chfs/chfs_pool.h ufs/chfs/debug.h ufs/chfs/ebh.h \ - ufs/chfs/ebh_media.h ufs/chfs/ebh_misc.h ufs/chfs/media.h \ - ufs/ext2fs/ext2fs.h ufs/ext2fs/ext2fs_dinode.h \ - ufs/ext2fs/ext2fs_dir.h ufs/ext2fs/ext2fs_extern.h \ - ufs/ffs/ffs_extern.h ufs/ffs/fs.h ufs/lfs/lfs.h \ - ufs/lfs/lfs_extern.h ufs/mfs/mfs_extern.h ufs/mfs/mfsnode.h \ - ufs/ufs/dinode.h ufs/ufs/dir.h ufs/ufs/dirhash.h \ - ufs/ufs/extattr.h ufs/ufs/inode.h ufs/ufs/quota.h \ - ufs/ufs/quota1.h ufs/ufs/quota2.h ufs/ufs/ufs_bswap.h \ - ufs/ufs/ufs_extern.h ufs/ufs/ufs_quota.h ufs/ufs/ufs_wapbl.h \ - ufs/ufs/ufsmount.h \ - .else INCS= a.out.h aio.h ar.h assert.h atomic.h \ bitstring.h bm.h cdbr.h cdbw.h complex.h cpio.h ctype.h \ diff --git a/include/arch/i386/include/Makefile b/include/arch/i386/include/Makefile index 93281aee8..8a1150c65 100644 --- a/include/arch/i386/include/Makefile +++ b/include/arch/i386/include/Makefile @@ -9,8 +9,12 @@ INCS= ansi.h asm.h bswap.h byte_swap.h cdefs.h \ int_mwgwtypes.h int_types.h limits.h \ math.h mcontext.h npx.h param.h profile.h \ setjmp.h signal.h stdarg.h types.h \ - vmparam.h wchar_limits.h + vmparam.h wchar_limits.h \ + archtypes.h bios.h cmos.h cpu.h diskparm.h fpu.h int86.h \ + interrupt.h memory.h multiboot.h partition.h \ + pci.h pci_amd.h pci_intel.h pci_sis.h pci_via.h \ + ports.h stackframe.h vm.h elf.h elf_machdep.h mutex.h \ + disklabel.h -.include "${MINIXSRCDIR}/common/include/arch/i386/Makefile.inc" .include diff --git a/common/include/arch/i386/Makefile.inc b/include/arch/i386/include/Makefile.inc similarity index 100% rename from common/include/arch/i386/Makefile.inc rename to include/arch/i386/include/Makefile.inc diff --git a/common/include/arch/i386/archtypes.h b/include/arch/i386/include/archtypes.h similarity index 100% rename from common/include/arch/i386/archtypes.h rename to include/arch/i386/include/archtypes.h diff --git a/common/include/arch/i386/bios.h b/include/arch/i386/include/bios.h similarity index 100% rename from common/include/arch/i386/bios.h rename to include/arch/i386/include/bios.h diff --git a/common/include/arch/i386/cmos.h b/include/arch/i386/include/cmos.h similarity index 100% rename from common/include/arch/i386/cmos.h rename to include/arch/i386/include/cmos.h diff --git a/common/include/arch/i386/cpu.h b/include/arch/i386/include/cpu.h similarity index 100% rename from common/include/arch/i386/cpu.h rename to include/arch/i386/include/cpu.h diff --git a/include/arch/i386/include/disklabel.h 
b/include/arch/i386/include/disklabel.h index bf567de6d..e7d5246bc 100644 --- a/include/arch/i386/include/disklabel.h +++ b/include/arch/i386/include/disklabel.h @@ -1,4 +1,4 @@ -/* $NetBSD: disklabel.h,v 1.15 2009/11/23 13:40:10 pooka Exp $ */ +/* $NetBSD: disklabel.h,v 1.16 2011/08/30 12:39:55 bouyer Exp $ */ /* * Copyright (c) 1994 Christopher G. Demetriou @@ -33,6 +33,7 @@ #ifndef _I386_DISKLABEL_H_ #define _I386_DISKLABEL_H_ +#define LABELUSESMBR 1 /* use MBR partitionning */ #define LABELSECTOR 1 /* sector containing label */ #define LABELOFFSET 0 /* offset of label in sector */ #define MAXPARTITIONS 16 /* number of partitions */ diff --git a/common/include/arch/i386/diskparm.h b/include/arch/i386/include/diskparm.h similarity index 100% rename from common/include/arch/i386/diskparm.h rename to include/arch/i386/include/diskparm.h diff --git a/common/include/arch/i386/elf.h b/include/arch/i386/include/elf.h similarity index 100% rename from common/include/arch/i386/elf.h rename to include/arch/i386/include/elf.h diff --git a/common/include/arch/i386/elf_machdep.h b/include/arch/i386/include/elf_machdep.h similarity index 100% rename from common/include/arch/i386/elf_machdep.h rename to include/arch/i386/include/elf_machdep.h diff --git a/common/include/arch/i386/fpu.h b/include/arch/i386/include/fpu.h similarity index 100% rename from common/include/arch/i386/fpu.h rename to include/arch/i386/include/fpu.h diff --git a/common/include/arch/i386/int86.h b/include/arch/i386/include/int86.h similarity index 100% rename from common/include/arch/i386/int86.h rename to include/arch/i386/include/int86.h diff --git a/common/include/arch/i386/interrupt.h b/include/arch/i386/include/interrupt.h similarity index 100% rename from common/include/arch/i386/interrupt.h rename to include/arch/i386/include/interrupt.h diff --git a/common/include/arch/i386/memory.h b/include/arch/i386/include/memory.h similarity index 100% rename from common/include/arch/i386/memory.h rename to include/arch/i386/include/memory.h diff --git a/common/include/arch/i386/multiboot.h b/include/arch/i386/include/multiboot.h similarity index 100% rename from common/include/arch/i386/multiboot.h rename to include/arch/i386/include/multiboot.h diff --git a/common/include/arch/i386/mutex.h b/include/arch/i386/include/mutex.h similarity index 100% rename from common/include/arch/i386/mutex.h rename to include/arch/i386/include/mutex.h diff --git a/common/include/arch/i386/partition.h b/include/arch/i386/include/partition.h similarity index 100% rename from common/include/arch/i386/partition.h rename to include/arch/i386/include/partition.h diff --git a/common/include/arch/i386/pci.h b/include/arch/i386/include/pci.h similarity index 100% rename from common/include/arch/i386/pci.h rename to include/arch/i386/include/pci.h diff --git a/common/include/arch/i386/pci_amd.h b/include/arch/i386/include/pci_amd.h similarity index 100% rename from common/include/arch/i386/pci_amd.h rename to include/arch/i386/include/pci_amd.h diff --git a/common/include/arch/i386/pci_intel.h b/include/arch/i386/include/pci_intel.h similarity index 100% rename from common/include/arch/i386/pci_intel.h rename to include/arch/i386/include/pci_intel.h diff --git a/common/include/arch/i386/pci_sis.h b/include/arch/i386/include/pci_sis.h similarity index 100% rename from common/include/arch/i386/pci_sis.h rename to include/arch/i386/include/pci_sis.h diff --git a/common/include/arch/i386/pci_via.h b/include/arch/i386/include/pci_via.h similarity index 100% 
rename from common/include/arch/i386/pci_via.h rename to include/arch/i386/include/pci_via.h diff --git a/common/include/arch/i386/ports.h b/include/arch/i386/include/ports.h similarity index 100% rename from common/include/arch/i386/ports.h rename to include/arch/i386/include/ports.h diff --git a/common/include/arch/i386/stackframe.h b/include/arch/i386/include/stackframe.h similarity index 100% rename from common/include/arch/i386/stackframe.h rename to include/arch/i386/include/stackframe.h diff --git a/common/include/arch/i386/vm.h b/include/arch/i386/include/vm.h similarity index 100% rename from common/include/arch/i386/vm.h rename to include/arch/i386/include/vm.h diff --git a/sys/Makefile b/sys/Makefile index 3bee04ec7..1861b4903 100644 --- a/sys/Makefile +++ b/sys/Makefile @@ -6,5 +6,6 @@ SUBDIR= arch/i386/stand/mbr SUBDIR+= arch/i386/stand/bootxx SUBDIR+= arch/i386/stand/boot SUBDIR+= arch/i386/stand/cdboot +SUBDIR+= ufs .include diff --git a/sys/ufs/Makefile b/sys/ufs/Makefile new file mode 100644 index 000000000..c06bbf848 --- /dev/null +++ b/sys/ufs/Makefile @@ -0,0 +1,7 @@ +# $NetBSD: Makefile,v 1.2 2002/11/26 23:30:35 lukem Exp $ + +SUBDIR= ffs lfs mfs ufs ext2fs + +INCSDIR= /usr/include/ufs + +.include diff --git a/include/ufs/chfs/chfs.h b/sys/ufs/chfs/chfs.h similarity index 100% rename from include/ufs/chfs/chfs.h rename to sys/ufs/chfs/chfs.h diff --git a/include/ufs/chfs/chfs_args.h b/sys/ufs/chfs/chfs_args.h similarity index 100% rename from include/ufs/chfs/chfs_args.h rename to sys/ufs/chfs/chfs_args.h diff --git a/sys/ufs/chfs/chfs_build.c b/sys/ufs/chfs/chfs_build.c new file mode 100644 index 000000000..3904b023a --- /dev/null +++ b/sys/ufs/chfs/chfs_build.c @@ -0,0 +1,405 @@ +/* $NetBSD: chfs_build.c,v 1.2 2011/11/24 21:22:39 agc Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "chfs.h" +//#include + + +void +chfs_calc_trigger_levels(struct chfs_mount *chmp) +{ + uint32_t size; + + chmp->chm_resv_blocks_deletion = 2; + + size = chmp->chm_ebh->flash_size / 50; //2% of flash size + size += chmp->chm_ebh->peb_nr * 100; + size += chmp->chm_ebh->eb_size - 1; + + chmp->chm_resv_blocks_write = + chmp->chm_resv_blocks_deletion + (size / chmp->chm_ebh->eb_size); + chmp->chm_resv_blocks_gctrigger = chmp->chm_resv_blocks_write + 1; + chmp->chm_resv_blocks_gcmerge = chmp->chm_resv_blocks_deletion + 1; + chmp->chm_vdirty_blocks_gctrigger = chmp->chm_resv_blocks_gctrigger * 10; + + chmp->chm_nospc_dirty = + chmp->chm_ebh->eb_size + (chmp->chm_ebh->flash_size / 100); +} + + +/** + * chfs_build_set_vnodecache_nlink - set pvno and nlink in vnodecaches + * @chmp: CHFS main descriptor structure + * @vc: vnode cache + * This function travels @vc's directory entries and sets the pvno and nlink + * attribute of the vnode where the dirent's vno points. + */ +void +chfs_build_set_vnodecache_nlink(struct chfs_mount *chmp, + struct chfs_vnode_cache *vc) +{ + struct chfs_dirent *fd; + //dbg("set nlink\n"); + +// for (fd = vc->scan_dirents; fd; fd = fd->next) { + TAILQ_FOREACH(fd, &vc->scan_dirents, fds) { + struct chfs_vnode_cache *child_vc; + + if (!fd->vno) + continue; + + mutex_enter(&chmp->chm_lock_vnocache); + child_vc = chfs_vnode_cache_get(chmp, fd->vno); + mutex_exit(&chmp->chm_lock_vnocache); + if (!child_vc) { + chfs_mark_node_obsolete(chmp, fd->nref); + continue; + } + if (fd->type == VDIR) { + if (child_vc->nlink < 1) + child_vc->nlink = 1; + + if (child_vc->pvno) { + chfs_err("found a hard link: child dir: %s" + ", (vno: %llu) of dir vno: %llu\n", + fd->name, (unsigned long long)fd->vno, + (unsigned long long)vc->vno); + } else { + //dbg("child_vc->pvno = + // vc->vno; pvno = %d\n", child_vc->pvno); + child_vc->pvno = vc->vno; + } + } + child_vc->nlink++; + //dbg("child_vc->nlink++;\n"); + //child_vc->nlink++; + vc->nlink++; + } +} + +/** + * chfs_build_remove_unlinked vnode + */ +/* static */ +void +chfs_build_remove_unlinked_vnode(struct chfs_mount *chmp, + struct chfs_vnode_cache *vc, +// struct chfs_dirent **unlinked) + struct chfs_dirent_list *unlinked) +{ + struct chfs_node_ref *nref; + struct chfs_dirent *fd, *tmpfd; + + dbg("START\n"); + dbg("vno: %llu\n", (unsigned long long)vc->vno); + + nref = vc->dnode; + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + // The vnode cache is at the end of the data node's chain + while (nref != (struct chfs_node_ref *)vc) { + struct chfs_node_ref *next = nref->nref_next; + dbg("mark dnode\n"); + chfs_mark_node_obsolete(chmp, nref); + nref = next; + } + nref = vc->dirents; + // The vnode cache is at the end of the dirent node's chain + while (nref != (struct chfs_node_ref *)vc) { + struct chfs_node_ref *next = nref->nref_next; + dbg("mark dirent\n"); + chfs_mark_node_obsolete(chmp, nref); + nref = next; + } + if (!TAILQ_EMPTY(&vc->scan_dirents)) { + TAILQ_FOREACH_SAFE(fd, &vc->scan_dirents, fds, tmpfd) { +// while (vc->scan_dirents) { + struct chfs_vnode_cache *child_vc; +// fd = vc->scan_dirents; + dbg("dirent dump:\n"); + dbg(" ->vno: %llu\n", (unsigned long long)fd->vno); + dbg(" ->version: %llu\n", (unsigned long long)fd->version); + dbg(" ->nhash: 0x%x\n", fd->nhash); + dbg(" ->nsize: %d\n", fd->nsize); + dbg(" ->name: %s\n", fd->name); + dbg(" ->type: %d\n", fd->type); +// vc->scan_dirents = fd->next; + TAILQ_REMOVE(&vc->scan_dirents, fd, fds); + + if (!fd->vno) { + chfs_free_dirent(fd); + continue; + } + 
mutex_enter(&chmp->chm_lock_vnocache); + child_vc = chfs_vnode_cache_get(chmp, fd->vno); + mutex_exit(&chmp->chm_lock_vnocache); + if (!child_vc) { + chfs_free_dirent(fd); + continue; + } + /** + * Decrease nlink in child. If it is 0, add to unlinked + * dirents or just free it otherwise. + */ + child_vc->nlink--; + + if (!child_vc->nlink) { + //dbg("nlink is 0\n"); +// fd->next = *unlinked; +// *unlinked = fd; + // XXX HEAD or TAIL? + // original code did HEAD, but we could add + // it to the TAIL easily with TAILQ. + TAILQ_INSERT_TAIL(unlinked, fd, fds); + } else { + chfs_free_dirent(fd); + } + } + } else { + dbg("there are no scan dirents\n"); + } + + nref = vc->v; + while ((struct chfs_vnode_cache *)nref != vc) { + if (!CHFS_REF_OBSOLETE(nref)) + chfs_mark_node_obsolete(chmp, nref); + nref = nref->nref_next; + } + + mutex_enter(&chmp->chm_lock_vnocache); + if (vc->vno != CHFS_ROOTINO) + chfs_vnode_cache_set_state(chmp, vc, VNO_STATE_UNCHECKED); + mutex_exit(&chmp->chm_lock_vnocache); + dbg("END\n"); +} + +/** + * chfs_build_filesystem - build in-memory representation of filesystem + * @chmp: super block information + * + * Step 1: + * This function scans through the eraseblocks mapped in EBH. + * During scan builds up the map of vnodes and directory entries and puts them + * into the vnode_cache. + * Step 2: + * Scans the directory tree and set the nlink in the vnode caches. + * Step 3: + * Scans vnode caches with nlink = 0 + */ +int +chfs_build_filesystem(struct chfs_mount *chmp) +{ + int i,err = 0; + struct chfs_vnode_cache *vc; + struct chfs_dirent *fd, *tmpfd; +// struct chfs_dirent *unlinked = NULL; + struct chfs_node_ref **nref; + struct chfs_dirent_list unlinked; + struct chfs_vnode_cache *notregvc; + + TAILQ_INIT(&unlinked); + + mutex_enter(&chmp->chm_lock_mountfields); + + /** + * Step 1 + */ + chmp->chm_flags |= CHFS_MP_FLAG_SCANNING; + for (i = 0; i < chmp->chm_ebh->peb_nr; i++) { + //dbg("processing block: %d\n", i); + chmp->chm_blocks[i].lnr = i; + chmp->chm_blocks[i].free_size = chmp->chm_ebh->eb_size; + //If the LEB is add to free list skip it. + if (chmp->chm_ebh->lmap[i] < 0) { + //dbg("block %d is unmapped\n", i); + TAILQ_INSERT_TAIL(&chmp->chm_free_queue, + &chmp->chm_blocks[i], queue); + chmp->chm_nr_free_blocks++; + continue; + } + + err = chfs_scan_eraseblock(chmp, &chmp->chm_blocks[i]); + switch (err) { + case CHFS_BLK_STATE_FREE: + chmp->chm_nr_free_blocks++; + TAILQ_INSERT_TAIL(&chmp->chm_free_queue, + &chmp->chm_blocks[i], queue); + break; + case CHFS_BLK_STATE_CLEAN: + TAILQ_INSERT_TAIL(&chmp->chm_clean_queue, + &chmp->chm_blocks[i], queue); + break; + case CHFS_BLK_STATE_PARTDIRTY: + //dbg("free size: %d\n", chmp->chm_blocks[i].free_size); + if (chmp->chm_blocks[i].free_size > chmp->chm_wbuf_pagesize && + (!chmp->chm_nextblock || + chmp->chm_blocks[i].free_size > + chmp->chm_nextblock->free_size)) { + /* convert the old nextblock's free size to + * dirty and put it on a list */ + if (chmp->chm_nextblock) { + err = chfs_close_eraseblock(chmp, + chmp->chm_nextblock); + if (err) + return err; + } + chmp->chm_nextblock = &chmp->chm_blocks[i]; + } else { + /* convert the scanned block's free size to + * dirty and put it on a list */ + err = chfs_close_eraseblock(chmp, + &chmp->chm_blocks[i]); + if (err) + return err; + } + break; + case CHFS_BLK_STATE_ALLDIRTY: + /* + * The block has a valid EBH header, but it doesn't + * contain any valid data. 
+ */ + TAILQ_INSERT_TAIL(&chmp->chm_erase_pending_queue, + &chmp->chm_blocks[i], queue); + chmp->chm_nr_erasable_blocks++; + break; + default: + /* It was an error, unknown state */ + break; + } + + } + chmp->chm_flags &= ~CHFS_MP_FLAG_SCANNING; + + + //TODO need bad block check (and bad block handling in EBH too!!) + /* Now EBH only checks block is bad during its scan operation. + * Need check at erase + write + read... + */ + + /** + * Step 2 + */ + chmp->chm_flags |= CHFS_MP_FLAG_BUILDING; + for (i = 0; i < VNODECACHE_SIZE; i++) { + vc = chmp->chm_vnocache_hash[i]; + while (vc) { + dbg("vc->vno: %llu\n", (unsigned long long)vc->vno); + if (!TAILQ_EMPTY(&vc->scan_dirents)) + chfs_build_set_vnodecache_nlink(chmp, vc); + vc = vc->next; + } + } + + /** + * Step 3 + * Scan for vnodes with 0 nlink. + */ + for (i = 0; i < VNODECACHE_SIZE; i++) { + vc = chmp->chm_vnocache_hash[i]; + while (vc) { + if (vc->nlink) { + vc = vc->next; + continue; + } + + //dbg("remove unlinked start i: %d\n", i); + chfs_build_remove_unlinked_vnode(chmp, + vc, &unlinked); + //dbg("remove unlinked end\n"); + vc = vc->next; + } + } + /* Remove the newly unlinked vnodes. They are on the unlinked list */ + TAILQ_FOREACH_SAFE(fd, &unlinked, fds, tmpfd) { +// while (unlinked) { +// fd = unlinked; +// unlinked = fd->next; + TAILQ_REMOVE(&unlinked, fd, fds); + mutex_enter(&chmp->chm_lock_vnocache); + vc = chfs_vnode_cache_get(chmp, fd->vno); + mutex_exit(&chmp->chm_lock_vnocache); + if (vc) { + chfs_build_remove_unlinked_vnode(chmp, + vc, &unlinked); + } + chfs_free_dirent(fd); + } + + chmp->chm_flags &= ~CHFS_MP_FLAG_BUILDING; + + /* Free all dirents */ + for (i = 0; i < VNODECACHE_SIZE; i++) { + vc = chmp->chm_vnocache_hash[i]; + while (vc) { + TAILQ_FOREACH_SAFE(fd, &vc->scan_dirents, fds, tmpfd) { +// while (vc->scan_dirents) { +// fd = vc->scan_dirents; +// vc->scan_dirents = fd->next; + TAILQ_REMOVE(&vc->scan_dirents, fd, fds); + if (fd->vno == 0) { + //for (nref = &vc->dirents; + // *nref != fd->nref; + // nref = &((*nref)->next)); + + nref = &fd->nref; + *nref = fd->nref->nref_next; + //fd->nref->nref_next = NULL; + } else if (fd->type == VDIR) { + //set state every non-VREG file's vc + mutex_enter(&chmp->chm_lock_vnocache); + notregvc = + chfs_vnode_cache_get(chmp, + fd->vno); + chfs_vnode_cache_set_state(chmp, + notregvc, VNO_STATE_PRESENT); + mutex_exit(&chmp->chm_lock_vnocache); + } + chfs_free_dirent(fd); + } +// vc->scan_dirents = NULL; + KASSERT(TAILQ_EMPTY(&vc->scan_dirents)); + vc = vc->next; + } + } + + //Set up chmp->chm_wbuf_ofs for the first write + if (chmp->chm_nextblock) { + dbg("free_size: %d\n", chmp->chm_nextblock->free_size); + chmp->chm_wbuf_ofs = chmp->chm_ebh->eb_size - + chmp->chm_nextblock->free_size; + } else { + chmp->chm_wbuf_ofs = 0xffffffff; + } + mutex_exit(&chmp->chm_lock_mountfields); + + return 0; +} + diff --git a/sys/ufs/chfs/chfs_erase.c b/sys/ufs/chfs/chfs_erase.c new file mode 100644 index 000000000..9ae49c37c --- /dev/null +++ b/sys/ufs/chfs/chfs_erase.c @@ -0,0 +1,137 @@ +/* $NetBSD: chfs_erase.c,v 1.1 2011/11/24 15:51:31 ahoka Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * Copyright (c) 2010 David Tengeri + * All rights reserved. 
+ * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * chfs_erase.c + * + * Copyright (C) 2010 David Tengeri , + * ... + * University of Szeged, Hungary + */ + +#include "chfs.h" + + +/** + * chfs_remap_leb - unmap and then map a leb + * @chmp: chfs mount structure + * + * This function gets an eraseblock from the erasable queue, unmaps it through + * EBH and maps another eraseblock to the same LNR. + * EBH will find a free eraseblock if any or will erase one if there isn't any + * free, just dirty block. + * + * Returns zero on case of success, errorcode otherwise. + * + * Needs more brainstorming here. + */ +int +chfs_remap_leb(struct chfs_mount *chmp) +{ + int err; + struct chfs_eraseblock *cheb; + dbg("chfs_remap_leb\n"); + uint32_t dirty, unchecked, used, free, wasted; + + //dbg("chmp->chm_nr_erasable_blocks: %d\n", chmp->chm_nr_erasable_blocks); + //dbg("ltree: %p ecl: %p\n", &chmp->chm_ebh->ltree_lock, &chmp->chm_lock_sizes); + KASSERT(!rw_write_held(&chmp->chm_lock_wbuf)); + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + KASSERT(mutex_owned(&chmp->chm_lock_sizes)); + + if (!chmp->chm_nr_erasable_blocks) { + //TODO + /* We don't have any erasable blocks, need to check if there are + * blocks on erasable_pending_wbuf_queue, flush the data and then + * we can remap it. + * If there aren't any blocks on that list too, we need to GC? + */ + if (!TAILQ_EMPTY(&chmp->chm_erasable_pending_wbuf_queue)) { + cheb = TAILQ_FIRST(&chmp->chm_erasable_pending_wbuf_queue); + TAILQ_REMOVE(&chmp->chm_erasable_pending_wbuf_queue, cheb, queue); + if (chmp->chm_wbuf_len) { + mutex_exit(&chmp->chm_lock_sizes); + chfs_flush_pending_wbuf(chmp); + mutex_enter(&chmp->chm_lock_sizes); + } + TAILQ_INSERT_TAIL(&chmp->chm_erase_pending_queue, cheb, queue); + chmp->chm_nr_erasable_blocks++; + } else { + /* We can't delete any block. */ + //FIXME should we return ENOSPC? 
+ return ENOSPC; + } + } + cheb = TAILQ_FIRST(&chmp->chm_erase_pending_queue); + TAILQ_REMOVE(&chmp->chm_erase_pending_queue, cheb, queue); + chmp->chm_nr_erasable_blocks--; + + dirty = cheb->dirty_size; + unchecked = cheb->unchecked_size; + used = cheb->used_size; + free = cheb->free_size; + wasted = cheb->wasted_size; + + // Free allocated node references for this eraseblock + chfs_free_node_refs(cheb); + + err = chfs_unmap_leb(chmp, cheb->lnr); + if (err) + return err; + + err = chfs_map_leb(chmp, cheb->lnr); + if (err) + return err; + // Reset state to default and change chmp sizes too + chfs_change_size_dirty(chmp, cheb, -dirty); + chfs_change_size_unchecked(chmp, cheb, -unchecked); + chfs_change_size_used(chmp, cheb, -used); + chfs_change_size_free(chmp, cheb, chmp->chm_ebh->eb_size - free); + chfs_change_size_wasted(chmp, cheb, -wasted); + + KASSERT(cheb->dirty_size == 0); + KASSERT(cheb->unchecked_size == 0); + KASSERT(cheb->used_size == 0); + KASSERT(cheb->free_size == chmp->chm_ebh->eb_size); + KASSERT(cheb->wasted_size == 0); + + cheb->first_node = NULL; + cheb->last_node = NULL; + //put it to free_queue + TAILQ_INSERT_TAIL(&chmp->chm_free_queue, cheb, queue); + chmp->chm_nr_free_blocks++; + dbg("remaped (free: %d, erasable: %d)\n", chmp->chm_nr_free_blocks, chmp->chm_nr_erasable_blocks); + KASSERT(!TAILQ_EMPTY(&chmp->chm_free_queue)); + + return 0; +} diff --git a/sys/ufs/chfs/chfs_gc.c b/sys/ufs/chfs/chfs_gc.c new file mode 100644 index 000000000..aa32d64b9 --- /dev/null +++ b/sys/ufs/chfs/chfs_gc.c @@ -0,0 +1,1238 @@ +/* $NetBSD: chfs_gc.c,v 1.2 2011/11/24 21:09:37 agc Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * Copyright (c) 2010 Tamas Toth + * Copyright (c) 2010 Adam Hoka + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "chfs.h" + +void chfs_gc_release_inode(struct chfs_mount *, + struct chfs_inode *); +struct chfs_inode *chfs_gc_fetch_inode(struct chfs_mount *, + ino_t, uint32_t); +int chfs_check(struct chfs_mount *, struct chfs_vnode_cache *); +void chfs_clear_inode(struct chfs_mount *, struct chfs_inode *); + + +struct chfs_eraseblock *find_gc_block(struct chfs_mount *); +int chfs_gcollect_pristine(struct chfs_mount *, + struct chfs_eraseblock *, + struct chfs_vnode_cache *, struct chfs_node_ref *); +int chfs_gcollect_live(struct chfs_mount *, + struct chfs_eraseblock *, struct chfs_node_ref *, + struct chfs_inode *); +int chfs_gcollect_vnode(struct chfs_mount *, struct chfs_inode *); +int chfs_gcollect_dirent(struct chfs_mount *, + struct chfs_eraseblock *, struct chfs_inode *, + struct chfs_dirent *); +int chfs_gcollect_deletion_dirent(struct chfs_mount *, + struct chfs_eraseblock *, struct chfs_inode *, + struct chfs_dirent *); +int chfs_gcollect_dnode(struct chfs_mount *, + struct chfs_eraseblock *, struct chfs_inode *, + struct chfs_full_dnode *, uint32_t, uint32_t); + +/* must be called with chm_lock_mountfields held */ +void +chfs_gc_trigger(struct chfs_mount *chmp) +{ + struct garbage_collector_thread *gc = &chmp->chm_gc_thread; + + //mutex_enter(&chmp->chm_lock_sizes); + if (gc->gcth_running && + chfs_gc_thread_should_wake(chmp)) { + cv_signal(&gc->gcth_wakeup); + } + //mutex_exit(&chmp->chm_lock_sizes); +} + + +void +chfs_gc_thread(void *data) +{ + struct chfs_mount *chmp = data; + struct garbage_collector_thread *gc = &chmp->chm_gc_thread; + + dbg_gc("[GC THREAD] thread started\n"); + + mutex_enter(&chmp->chm_lock_mountfields); + while (gc->gcth_running) { + /* we must call chfs_gc_thread_should_wake with chm_lock_mountfields + * held, which is a bit awkwardly done here, but we cant relly + * do it otherway with the current design... + */ + if (chfs_gc_thread_should_wake(chmp)) { +// mutex_exit(&chmp->chm_lock_mountfields); + if (chfs_gcollect_pass(chmp) == ENOSPC) { + dbg_gc("No space for garbage collection\n"); + panic("No space for garbage collection\n"); + /* XXX why break here? i have added a panic + * here to see if it gets triggered -ahoka + */ + break; + } + /* XXX gcollect_pass drops the mutex */ + mutex_enter(&chmp->chm_lock_mountfields); + } + + cv_timedwait_sig(&gc->gcth_wakeup, + &chmp->chm_lock_mountfields, mstohz(100)); + } + mutex_exit(&chmp->chm_lock_mountfields); + + dbg_gc("[GC THREAD] thread stopped\n"); + kthread_exit(0); +} + +void +chfs_gc_thread_start(struct chfs_mount *chmp) +{ + struct garbage_collector_thread *gc = &chmp->chm_gc_thread; + + cv_init(&gc->gcth_wakeup, "chfsgccv"); + + gc->gcth_running = true; + kthread_create(PRI_NONE, /*KTHREAD_MPSAFE |*/ KTHREAD_MUSTJOIN, + NULL, chfs_gc_thread, chmp, &gc->gcth_thread, + "chfsgcth"); +} + +void +chfs_gc_thread_stop(struct chfs_mount *chmp) +{ + struct garbage_collector_thread *gc = &chmp->chm_gc_thread; + + /* check if it is actually running. 
if not, do nothing */ + if (gc->gcth_running) { + gc->gcth_running = false; + } else { + return; + } + cv_signal(&gc->gcth_wakeup); + dbg_gc("[GC THREAD] stop signal sent\n"); + + kthread_join(gc->gcth_thread); +#ifdef BROKEN_KTH_JOIN + kpause("chfsthjoin", false, mstohz(1000), NULL); +#endif + + cv_destroy(&gc->gcth_wakeup); +} + +/* must be called with chm_lock_mountfields held */ +int +chfs_gc_thread_should_wake(struct chfs_mount *chmp) +{ + int nr_very_dirty = 0; + struct chfs_eraseblock *cheb; + uint32_t dirty; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + if (!TAILQ_EMPTY(&chmp->chm_erase_pending_queue)) { + dbg_gc("erase_pending\n"); + return 1; + } + + if (chmp->chm_unchecked_size) { + dbg_gc("unchecked\n"); + return 1; + } + + dirty = chmp->chm_dirty_size - chmp->chm_nr_erasable_blocks * + chmp->chm_ebh->eb_size; + + if (chmp->chm_nr_free_blocks + chmp->chm_nr_erasable_blocks < + chmp->chm_resv_blocks_gctrigger && (dirty > chmp->chm_nospc_dirty)) { + dbg_gc("free: %d + erasable: %d < resv: %d\n", + chmp->chm_nr_free_blocks, chmp->chm_nr_erasable_blocks, + chmp->chm_resv_blocks_gctrigger); + dbg_gc("dirty: %d > nospc_dirty: %d\n", + dirty, chmp->chm_nospc_dirty); + + return 1; + } + + TAILQ_FOREACH(cheb, &chmp->chm_very_dirty_queue, queue) { + nr_very_dirty++; + if (nr_very_dirty == chmp->chm_vdirty_blocks_gctrigger) { + dbg_gc("nr_very_dirty\n"); + return 1; + } + } + + return 0; +} + +void +chfs_gc_release_inode(struct chfs_mount *chmp, + struct chfs_inode *ip) +{ + dbg_gc("release inode\n"); + //mutex_exit(&ip->inode_lock); + //vput(ITOV(ip)); +} + +struct chfs_inode * +chfs_gc_fetch_inode(struct chfs_mount *chmp, ino_t vno, + uint32_t unlinked) +{ + struct vnode *vp = NULL; + struct chfs_vnode_cache *vc; + struct chfs_inode *ip; + dbg_gc("fetch inode %llu\n", (unsigned long long)vno); + + if (unlinked) { + dbg_gc("unlinked\n"); + vp = chfs_vnode_lookup(chmp, vno); + if (!vp) { + mutex_enter(&chmp->chm_lock_vnocache); + vc = chfs_vnode_cache_get(chmp, vno); + if (!vc) { + mutex_exit(&chmp->chm_lock_vnocache); + return NULL; + } + if (vc->state != VNO_STATE_CHECKEDABSENT) { + //sleep_on_spinunlock(&chmp->chm_lock_vnocache); + mutex_exit(&chmp->chm_lock_vnocache); + /* XXX why do we need the delay here?! 
*/ +// kpause("chvncabs", true, mstohz(50), NULL); + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + cv_timedwait_sig( + &chmp->chm_gc_thread.gcth_wakeup, + &chmp->chm_lock_mountfields, mstohz(50)); + +// KASSERT(!mutex_owned(&chmp->chm_lock_vnocache)); + } else { + mutex_exit(&chmp->chm_lock_vnocache); + } + return NULL; + } + } else { + dbg_gc("vnode lookup\n"); + vp = chfs_vnode_lookup(chmp, vno); + //VFS_VGET(chmp->chm_fsmp, vno, &vp); + } + dbg_gc("vp to ip\n"); + ip = VTOI(vp); + KASSERT(ip); + //mutex_enter(&ip->inode_lock); + + return ip; +} + +extern rb_tree_ops_t frag_rbtree_ops; + +int +chfs_check(struct chfs_mount *chmp, struct chfs_vnode_cache *chvc) +{ + struct chfs_inode *ip; + struct vnode *vp; + int ret; + + ip = pool_get(&chfs_inode_pool, PR_WAITOK); + if (!ip) { + return ENOMEM; + } + + vp = kmem_zalloc(sizeof(struct vnode), KM_SLEEP); + + ip->chvc = chvc; + ip->vp = vp; + + vp->v_data = ip; + + rb_tree_init(&ip->fragtree, &frag_rbtree_ops); + TAILQ_INIT(&ip->dents); + + ret = chfs_read_inode_internal(chmp, ip); + if (!ret) { + chfs_clear_inode(chmp, ip); + } + + pool_put(&chfs_inode_pool, ip); + + return ret; +} + +void +chfs_clear_inode(struct chfs_mount *chmp, struct chfs_inode *ip) +{ + struct chfs_dirent *fd, *tmpfd; + struct chfs_vnode_cache *chvc; + + + /* XXX not sure if this is the correct locking */ +// mutex_enter(&chmp->chm_lock_vnocache); + chvc = ip->chvc; + /* shouldnt this be: */ + //bool deleted = (chvc && !(chvc->pvno || chvc->nlink)); + int deleted = (chvc && !(chvc->pvno | chvc->nlink)); + + if (chvc && chvc->state != VNO_STATE_CHECKING) { +// chfs_vnode_cache_state_set(chmp, chvc, VNO_STATE_CLEARING); + chvc->state = VNO_STATE_CLEARING; + } + + if (chvc->v && ((struct chfs_vnode_cache *)chvc->v != chvc)) { + if (deleted) + chfs_mark_node_obsolete(chmp, chvc->v); + //chfs_free_refblock(chvc->v); + } +// mutex_enter(&chmp->chm_lock_vnocache); + + chfs_kill_fragtree(&ip->fragtree); +/* + fd = TAILQ_FIRST(&ip->dents); + while (fd) { + TAILQ_REMOVE(&ip->dents, fd, fds); + chfs_free_dirent(fd); + fd = TAILQ_FIRST(&ip->dents); + } +*/ + + TAILQ_FOREACH_SAFE(fd, &ip->dents, fds, tmpfd) { + chfs_free_dirent(fd); + } + + if (chvc && chvc->state == VNO_STATE_CHECKING) { + chfs_vnode_cache_set_state(chmp, + chvc, VNO_STATE_CHECKEDABSENT); + if ((struct chfs_vnode_cache *)chvc->v == chvc && + (struct chfs_vnode_cache *)chvc->dirents == chvc && + (struct chfs_vnode_cache *)chvc->dnode == chvc) + chfs_vnode_cache_remove(chmp, chvc); + } + +} + +struct chfs_eraseblock * +find_gc_block(struct chfs_mount *chmp) +{ + struct chfs_eraseblock *ret; + struct chfs_eraseblock_queue *nextqueue; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + struct timespec now; + vfs_timestamp(&now); + + int n = now.tv_nsec % 128; + + //dbg_gc("n = %d\n", n); +again: +/* if (!TAILQ_EMPTY(&chmp->chm_bad_used_queue) && chmp->chm_nr_free_blocks > chmp->chm_nr_resv_blocks_gcbad) { + dbg_gc("Picking block from bad_used_queue to GC next\n"); + nextqueue = &chmp->chm_bad_used_queue; + } else */if (n<50 && !TAILQ_EMPTY(&chmp->chm_erase_pending_queue)) { + dbg_gc("Picking block from erase_pending_queue to GC next\n"); + nextqueue = &chmp->chm_erase_pending_queue; + } else if (n<110 && !TAILQ_EMPTY(&chmp->chm_very_dirty_queue) ) { + dbg_gc("Picking block from very_dirty_queue to GC next\n"); + nextqueue = &chmp->chm_very_dirty_queue; + } else if (n<126 && !TAILQ_EMPTY(&chmp->chm_dirty_queue) ) { + dbg_gc("Picking block from dirty_queue to GC next\n"); + nextqueue = 
&chmp->chm_dirty_queue; + } else if (!TAILQ_EMPTY(&chmp->chm_clean_queue)) { + dbg_gc("Picking block from clean_queue to GC next\n"); + nextqueue = &chmp->chm_clean_queue; + } else if (!TAILQ_EMPTY(&chmp->chm_dirty_queue)) { + dbg_gc("Picking block from dirty_queue to GC next" + " (clean_queue was empty)\n"); + nextqueue = &chmp->chm_dirty_queue; + } else if (!TAILQ_EMPTY(&chmp->chm_very_dirty_queue)) { + dbg_gc("Picking block from very_dirty_queue to GC next" + " (clean_queue and dirty_queue were empty)\n"); + nextqueue = &chmp->chm_very_dirty_queue; + } else if (!TAILQ_EMPTY(&chmp->chm_erase_pending_queue)) { + dbg_gc("Picking block from erase_pending_queue to GC next" + " (clean_queue and {very_,}dirty_queue were empty)\n"); + nextqueue = &chmp->chm_erase_pending_queue; + } else if (!TAILQ_EMPTY(&chmp->chm_erasable_pending_wbuf_queue)) { + dbg_gc("Syncing wbuf in order to reuse " + "erasable_pending_wbuf_queue blocks\n"); + rw_enter(&chmp->chm_lock_wbuf, RW_WRITER); + chfs_flush_pending_wbuf(chmp); + rw_exit(&chmp->chm_lock_wbuf); + goto again; + } else { + dbg_gc("CHFS: no clean, dirty _or_ erasable" + " blocks to GC from! Where are they all?\n"); + return NULL; + } + + ret = TAILQ_FIRST(nextqueue); + if (chmp->chm_nextblock) { + dbg_gc("nextblock num: %u - gcblock num: %u\n", + chmp->chm_nextblock->lnr, ret->lnr); + if (ret == chmp->chm_nextblock) + goto again; + //KASSERT(ret != chmp->chm_nextblock); + //dbg_gc("first node lnr: %u ofs: %u\n", ret->first_node->lnr, ret->first_node->offset); + //dbg_gc("last node lnr: %u ofs: %u\n", ret->last_node->lnr, ret->last_node->offset); + } + TAILQ_REMOVE(nextqueue, ret, queue); + chmp->chm_gcblock = ret; + ret->gc_node = ret->first_node; + + if (!ret->gc_node) { + dbg_gc("Oops! ret->gc_node at LEB: %u is NULL\n", ret->lnr); + panic("CHFS BUG - one LEB's gc_node is NULL\n"); + } + + /* TODO wasted size?
*/ + return ret; +} + + +int +chfs_gcollect_pass(struct chfs_mount *chmp) +{ + struct chfs_vnode_cache *vc; + struct chfs_eraseblock *eb; + struct chfs_node_ref *nref; + uint32_t gcblock_dirty; + struct chfs_inode *ip; + ino_t vno, pvno; + uint32_t nlink; + int ret = 0; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + +// mutex_enter(&chmp->chm_lock_mountfields); + for (;;) { + mutex_enter(&chmp->chm_lock_sizes); + + dbg_gc("unchecked size == %u\n", chmp->chm_unchecked_size); + if (!chmp->chm_unchecked_size) + break; + + if (chmp->chm_checked_vno > chmp->chm_max_vno) { + mutex_exit(&chmp->chm_lock_sizes); + mutex_exit(&chmp->chm_lock_mountfields); + dbg_gc("checked_vno (#%llu) > max_vno (#%llu)\n", + (unsigned long long)chmp->chm_checked_vno, + (unsigned long long)chmp->chm_max_vno); + return ENOSPC; + } + + mutex_exit(&chmp->chm_lock_sizes); + + mutex_enter(&chmp->chm_lock_vnocache); + dbg_gc("checking vno #%llu\n", + (unsigned long long)chmp->chm_checked_vno); + dbg_gc("get vnode cache\n"); + vc = chfs_vnode_cache_get(chmp, chmp->chm_checked_vno++); + + if (!vc) { + dbg_gc("!vc\n"); + mutex_exit(&chmp->chm_lock_vnocache); + continue; + } + + if ((vc->pvno | vc->nlink) == 0) { + dbg_gc("(pvno | nlink) == 0\n"); + mutex_exit(&chmp->chm_lock_vnocache); + continue; + } + + dbg_gc("switch\n"); + switch (vc->state) { + case VNO_STATE_CHECKEDABSENT: + case VNO_STATE_PRESENT: + mutex_exit(&chmp->chm_lock_vnocache); + continue; + + case VNO_STATE_GC: + case VNO_STATE_CHECKING: + mutex_exit(&chmp->chm_lock_vnocache); + mutex_exit(&chmp->chm_lock_mountfields); + dbg_gc("VNO_STATE GC or CHECKING\n"); + panic("CHFS BUG - vc state gc or checking\n"); + + case VNO_STATE_READING: + chmp->chm_checked_vno--; + mutex_exit(&chmp->chm_lock_vnocache); + /* XXX why do we need the delay here?! 
*/ + kpause("chvncrea", true, mstohz(50), NULL); + +// sleep_on_spinunlock(&chmp->chm_lock_vnocache); +// KASSERT(!mutex_owned(&chmp->chm_lock_vnocache)); + mutex_exit(&chmp->chm_lock_mountfields); + return 0; + + default: + mutex_exit(&chmp->chm_lock_vnocache); + mutex_exit(&chmp->chm_lock_mountfields); + dbg_gc("default\n"); + panic("CHFS BUG - vc state is other what we" + " checked\n"); + + case VNO_STATE_UNCHECKED: + ; + } + + chfs_vnode_cache_set_state(chmp, vc, VNO_STATE_CHECKING); + + /* XXX check if this is too heavy to call under + * chm_lock_vnocache + */ + ret = chfs_check(chmp, vc); + dbg_gc("set state\n"); + chfs_vnode_cache_set_state(chmp, + vc, VNO_STATE_CHECKEDABSENT); + + mutex_exit(&chmp->chm_lock_vnocache); + mutex_exit(&chmp->chm_lock_mountfields); + + return ret; + } + + + eb = chmp->chm_gcblock; + + if (!eb) { + eb = find_gc_block(chmp); + } + + if (!eb) { + dbg_gc("!eb\n"); + if (!TAILQ_EMPTY(&chmp->chm_erase_pending_queue)) { + mutex_exit(&chmp->chm_lock_sizes); + mutex_exit(&chmp->chm_lock_mountfields); + return EAGAIN; + } + mutex_exit(&chmp->chm_lock_sizes); + mutex_exit(&chmp->chm_lock_mountfields); + return EIO; + } + + if (!eb->used_size) { + dbg_gc("!eb->used_size\n"); + goto eraseit; + } + + nref = eb->gc_node; + //dbg_gc("gc use: %u\n", chmp->chm_nextblock->lnr); + //dbg_gc("nref: %u %u\n", nref->nref_lnr, nref->nref_offset); + gcblock_dirty = eb->dirty_size; + + while(CHFS_REF_OBSOLETE(nref)) { + //dbg_gc("obsoleted nref lnr: %u - offset: %u\n", nref->nref_lnr, nref->nref_offset); +#ifdef DBG_MSG_GC + if (nref == chmp->chm_blocks[nref->nref_lnr].last_node) { + dbg_gc("THIS NODE IS THE LAST NODE OF ITS EB\n"); + } +#endif + nref = node_next(nref); + if (!nref) { + //dbg_gc("!nref\n"); + eb->gc_node = nref; + mutex_exit(&chmp->chm_lock_sizes); + mutex_exit(&chmp->chm_lock_mountfields); + panic("CHFS BUG - nref is NULL)\n"); + } + } + eb->gc_node = nref; + //dbg_gc("nref the chosen one lnr: %u - offset: %u\n", nref->nref_lnr, nref->nref_offset); + KASSERT(nref->nref_lnr == chmp->chm_gcblock->lnr); + + if (!nref->nref_next) { + //dbg_gc("!nref->nref_next\n"); + mutex_exit(&chmp->chm_lock_sizes); + if (CHFS_REF_FLAGS(nref) == CHFS_PRISTINE_NODE_MASK) { + chfs_gcollect_pristine(chmp, eb, NULL, nref); + } else { + chfs_mark_node_obsolete(chmp, nref); + } + goto lock_size; + } + + dbg_gc("nref lnr: %u - offset: %u\n", nref->nref_lnr, nref->nref_offset); + vc = chfs_nref_to_vc(nref); + + mutex_exit(&chmp->chm_lock_sizes); + + //dbg_gc("enter vnocache lock on #%llu\n", vc->vno); + mutex_enter(&chmp->chm_lock_vnocache); + + dbg_gc("switch\n"); + switch(vc->state) { + case VNO_STATE_CHECKEDABSENT: + if (CHFS_REF_FLAGS(nref) == CHFS_PRISTINE_NODE_MASK) { + chfs_vnode_cache_set_state(chmp, vc, VNO_STATE_GC); + } + break; + + case VNO_STATE_PRESENT: + break; + + case VNO_STATE_UNCHECKED: + case VNO_STATE_CHECKING: + case VNO_STATE_GC: + mutex_exit(&chmp->chm_lock_vnocache); + mutex_exit(&chmp->chm_lock_mountfields); + panic("CHFS BUG - vc state unchecked," + " checking or gc (vno #%llu, num #%d)\n", + (unsigned long long)vc->vno, vc->state); + + case VNO_STATE_READING: + mutex_exit(&chmp->chm_lock_vnocache); + /* XXX why do we need the delay here?! 
*/ + kpause("chvncrea", true, mstohz(50), NULL); + +// sleep_on_spinunlock(&chmp->chm_lock_vnocache); +// KASSERT(!mutex_owned(&chmp->chm_lock_vnocache)); + mutex_exit(&chmp->chm_lock_mountfields); + return 0; + } + + if (vc->state == VNO_STATE_GC) { + dbg_gc("vc->state == VNO_STATE_GC\n"); + mutex_exit(&chmp->chm_lock_vnocache); + ret = chfs_gcollect_pristine(chmp, eb, NULL, nref); + +// chfs_vnode_cache_state_set(chmp, +// vc, VNO_STATE_CHECKEDABSENT); + /* XXX locking? */ + vc->state = VNO_STATE_CHECKEDABSENT; + //TODO wake_up(&chmp->chm_vnocache_wq); + if (ret != EBADF) + goto test_gcnode; + mutex_enter(&chmp->chm_lock_vnocache); + } + + vno = vc->vno; + pvno = vc->pvno; + nlink = vc->nlink; + mutex_exit(&chmp->chm_lock_vnocache); + + ip = chfs_gc_fetch_inode(chmp, vno, !(pvno | nlink)); + + if (!ip) { + dbg_gc("!ip\n"); + ret = 0; + goto lock_size; + } + + chfs_gcollect_live(chmp, eb, nref, ip); + + chfs_gc_release_inode(chmp, ip); + +test_gcnode: + if (eb->dirty_size == gcblock_dirty && + !CHFS_REF_OBSOLETE(eb->gc_node)) { + dbg_gc("ERROR collecting node at %u failed.\n", + CHFS_GET_OFS(eb->gc_node->nref_offset)); + + ret = ENOSPC; + } + +lock_size: + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + mutex_enter(&chmp->chm_lock_sizes); +eraseit: + dbg_gc("eraseit\n"); + + if (chmp->chm_gcblock) { + dbg_gc("eb used size = %u\n", chmp->chm_gcblock->used_size); + dbg_gc("eb free size = %u\n", chmp->chm_gcblock->free_size); + dbg_gc("eb dirty size = %u\n", chmp->chm_gcblock->dirty_size); + dbg_gc("eb unchecked size = %u\n", + chmp->chm_gcblock->unchecked_size); + dbg_gc("eb wasted size = %u\n", chmp->chm_gcblock->wasted_size); + + KASSERT(chmp->chm_gcblock->used_size + chmp->chm_gcblock->free_size + + chmp->chm_gcblock->dirty_size + + chmp->chm_gcblock->unchecked_size + + chmp->chm_gcblock->wasted_size == chmp->chm_ebh->eb_size); + + } + + if (chmp->chm_gcblock && chmp->chm_gcblock->dirty_size + + chmp->chm_gcblock->wasted_size == chmp->chm_ebh->eb_size) { + dbg_gc("Block at leb #%u completely obsoleted by GC, " + "Moving to erase_pending_queue\n", chmp->chm_gcblock->lnr); + TAILQ_INSERT_TAIL(&chmp->chm_erase_pending_queue, + chmp->chm_gcblock, queue); + chmp->chm_gcblock = NULL; + chmp->chm_nr_erasable_blocks++; + if (!TAILQ_EMPTY(&chmp->chm_erase_pending_queue)) { + ret = chfs_remap_leb(chmp); + } + } + + mutex_exit(&chmp->chm_lock_sizes); + mutex_exit(&chmp->chm_lock_mountfields); + dbg_gc("return\n"); + return ret; +} + + +int +chfs_gcollect_pristine(struct chfs_mount *chmp, struct chfs_eraseblock *cheb, + struct chfs_vnode_cache *chvc, struct chfs_node_ref *nref) +{ + struct chfs_node_ref *newnref; + struct chfs_flash_node_hdr *nhdr; + struct chfs_flash_vnode *fvnode; + struct chfs_flash_dirent_node *fdirent; + struct chfs_flash_data_node *fdata; + int ret, retries = 0; + uint32_t ofs, crc; + size_t totlen = chfs_nref_len(chmp, cheb, nref); + char *data; + struct iovec vec; + size_t retlen; + + dbg_gc("gcollect_pristine\n"); + + data = kmem_alloc(totlen, KM_SLEEP); + if (!data) + return ENOMEM; + + ofs = CHFS_GET_OFS(nref->nref_offset); + + ret = chfs_read_leb(chmp, nref->nref_lnr, data, ofs, totlen, &retlen); + if (ret) { + dbg_gc("reading error\n"); + return ret; + } + if (retlen != totlen) { + dbg_gc("read size error\n"); + return EIO; + } + nhdr = (struct chfs_flash_node_hdr *)data; + /* check the header */ + if (le16toh(nhdr->magic) != CHFS_FS_MAGIC_BITMASK) { + dbg_gc("node header magic number error\n"); + return EBADF; + } + crc = crc32(0, (uint8_t *)nhdr, 
CHFS_NODE_HDR_SIZE - 4); + if (crc != le32toh(nhdr->hdr_crc)) { + dbg_gc("node header crc error\n"); + return EBADF; + } + + switch(le16toh(nhdr->type)) { + case CHFS_NODETYPE_VNODE: + fvnode = (struct chfs_flash_vnode *)data; + crc = crc32(0, (uint8_t *)fvnode, sizeof(struct chfs_flash_vnode) - 4); + if (crc != le32toh(fvnode->node_crc)) { + dbg_gc("vnode crc error\n"); + return EBADF; + } + break; + case CHFS_NODETYPE_DIRENT: + fdirent = (struct chfs_flash_dirent_node *)data; + crc = crc32(0, (uint8_t *)fdirent, sizeof(struct chfs_flash_dirent_node) - 4); + if (crc != le32toh(fdirent->node_crc)) { + dbg_gc("dirent crc error\n"); + return EBADF; + } + crc = crc32(0, fdirent->name, fdirent->nsize); + if (crc != le32toh(fdirent->name_crc)) { + dbg_gc("dirent name crc error\n"); + return EBADF; + } + break; + case CHFS_NODETYPE_DATA: + fdata = (struct chfs_flash_data_node *)data; + crc = crc32(0, (uint8_t *)fdata, sizeof(struct chfs_flash_data_node) - 4); + if (crc != le32toh(fdata->node_crc)) { + dbg_gc("data node crc error\n"); + return EBADF; + } + break; + default: + if (chvc) { + dbg_gc("unknown node have vnode cache\n"); + return EBADF; + } + } + /* CRC's OK, write node to its new place */ +retry: + ret = chfs_reserve_space_gc(chmp, totlen); + if (ret) + return ret; + + newnref = chfs_alloc_node_ref(chmp->chm_nextblock); + if (!newnref) + return ENOMEM; + + ofs = chmp->chm_ebh->eb_size - chmp->chm_nextblock->free_size; + newnref->nref_offset = ofs; + + vec.iov_base = (void *)data; + vec.iov_len = totlen; + mutex_enter(&chmp->chm_lock_sizes); + ret = chfs_write_wbuf(chmp, &vec, 1, ofs, &retlen); + + if (ret || retlen != totlen) { + chfs_err("error while writing out to the media\n"); + chfs_err("err: %d | size: %zu | retlen : %zu\n", + ret, totlen, retlen); + + chfs_change_size_dirty(chmp, chmp->chm_nextblock, totlen); + if (retries) { + mutex_exit(&chmp->chm_lock_sizes); + return EIO; + } + + retries++; + mutex_exit(&chmp->chm_lock_sizes); + goto retry; + } + + mutex_exit(&chmp->chm_lock_sizes); + //TODO should we set free_size? + chfs_mark_node_obsolete(chmp, nref); + chfs_add_vnode_ref_to_vc(chmp, chvc, newnref); + return 0; +} + + +int +chfs_gcollect_live(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb, struct chfs_node_ref *nref, + struct chfs_inode *ip) +{ + struct chfs_node_frag *frag; + struct chfs_full_dnode *fn = NULL; + int start = 0, end = 0, nrfrags = 0; + struct chfs_dirent *fd = NULL; + int ret = 0; + bool is_dirent; + + dbg_gc("gcollect_live\n"); + + if (chmp->chm_gcblock != cheb) { + dbg_gc("GC block is no longer gcblock. Restart.\n"); + goto upnout; + } + + if (CHFS_REF_OBSOLETE(nref)) { + dbg_gc("node to be GC'd was obsoleted in the meantime.\n"); + goto upnout; + } + + /* It's a vnode? */ + if (ip->chvc->v == nref) { + chfs_gcollect_vnode(chmp, ip); + goto upnout; + } + + /* find fn */ + dbg_gc("find full dnode\n"); + for(frag = frag_first(&ip->fragtree); + frag; frag = frag_next(&ip->fragtree, frag)) { + if (frag->node && frag->node->nref == nref) { + fn = frag->node; + end = frag->ofs + frag->size; + if (!nrfrags++) + start = frag->ofs; + if (nrfrags == frag->node->frags) + break; + } + } + + /* It's a pristine node, or dnode (or hole? XXX have we hole nodes?) 
*/ + if (fn) { + if (CHFS_REF_FLAGS(nref) == CHFS_PRISTINE_NODE_MASK) { + ret = chfs_gcollect_pristine(chmp, + cheb, ip->chvc, nref); + if (!ret) { + frag->node->nref = ip->chvc->v; + } + if (ret != EBADF) + goto upnout; + } + //ret = chfs_gcollect_hole(chmp, cheb, ip, fn, start, end); + ret = chfs_gcollect_dnode(chmp, cheb, ip, fn, start, end); + goto upnout; + } + + + /* It's a dirent? */ + dbg_gc("find full dirent\n"); + is_dirent = false; + TAILQ_FOREACH(fd, &ip->dents, fds) { + if (fd->nref == nref) { + is_dirent = true; + break; + } + } + + if (is_dirent && fd->vno) { + ret = chfs_gcollect_dirent(chmp, cheb, ip, fd); + } else if (is_dirent) { + ret = chfs_gcollect_deletion_dirent(chmp, cheb, ip, fd); + } else { + dbg_gc("Nref at leb #%u offset 0x%08x wasn't in node list" + " for ino #%llu\n", + nref->nref_lnr, CHFS_GET_OFS(nref->nref_offset), + (unsigned long long)ip->ino); + if (CHFS_REF_OBSOLETE(nref)) { + dbg_gc("But it's obsolete so we don't mind" + " too much.\n"); + } + } + +upnout: + return ret; +} + +int +chfs_gcollect_vnode(struct chfs_mount *chmp, struct chfs_inode *ip) +{ + int ret; + dbg_gc("gcollect_vnode\n"); + + ret = chfs_write_flash_vnode(chmp, ip, ALLOC_GC); + + return ret; +} + +int +chfs_gcollect_dirent(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb, struct chfs_inode *parent, + struct chfs_dirent *fd) +{ + struct vnode *vnode = NULL; + struct chfs_inode *ip; + struct chfs_node_ref *prev; + dbg_gc("gcollect_dirent\n"); + + vnode = chfs_vnode_lookup(chmp, fd->vno); + + /* XXX maybe KASSERT or panic on this? */ + if (vnode == NULL) { + return ENOENT; + } + + ip = VTOI(vnode); + + prev = parent->chvc->dirents; + if (prev == fd->nref) { + parent->chvc->dirents = prev->nref_next; + dbg_gc("fd nref removed from dirents list\n"); + prev = NULL; + } + while (prev) { + if (prev->nref_next == fd->nref) { + prev->nref_next = fd->nref->nref_next; + dbg_gc("fd nref removed from dirents list\n"); + break; + } + prev = prev->nref_next; + } + + prev = fd->nref; + chfs_mark_node_obsolete(chmp, fd->nref); + return chfs_write_flash_dirent(chmp, + parent, ip, fd, fd->vno, ALLOC_GC); +} + +/* Check dirents what are marked as deleted. 
*/ +int +chfs_gcollect_deletion_dirent(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb, struct chfs_inode *parent, + struct chfs_dirent *fd) +{ + struct chfs_flash_dirent_node chfdn; + struct chfs_node_ref *nref; + size_t retlen, name_len, nref_len; + uint32_t name_crc; + + int ret; + + struct vnode *vnode = NULL; + + dbg_gc("gcollect_deletion_dirent\n"); + + name_len = strlen(fd->name); + name_crc = crc32(0, fd->name, name_len); + + nref_len = chfs_nref_len(chmp, cheb, fd->nref); + + vnode = chfs_vnode_lookup(chmp, fd->vno); + + //dbg_gc("ip from vnode\n"); + //VFS_VGET(chmp->chm_fsmp, fd->vno, &vnode); + //ip = VTOI(vnode); + //vput(vnode); + + //dbg_gc("mutex enter erase_completion_lock\n"); + +// dbg_gc("alloc chfdn\n"); +// chfdn = kmem_alloc(nref_len, KM_SLEEP); +// if (!chfdn) +// return ENOMEM; + + for (nref = parent->chvc->dirents; + nref != (void*)parent->chvc; + nref = nref->nref_next) { + + if (!CHFS_REF_OBSOLETE(nref)) + continue; + + /* if node refs have different length, skip */ + if (chfs_nref_len(chmp, NULL, nref) != nref_len) + continue; + + if (CHFS_GET_OFS(nref->nref_offset) == + CHFS_GET_OFS(fd->nref->nref_offset)) { + continue; + } + + ret = chfs_read_leb(chmp, + nref->nref_lnr, (void*)&chfdn, CHFS_GET_OFS(nref->nref_offset), + nref_len, &retlen); + + if (ret) { + dbg_gc("Read error: %d\n", ret); + continue; + } + + if (retlen != nref_len) { + dbg_gc("Error reading node:" + " read: %zu insted of: %zu\n", retlen, nref_len); + continue; + } + + /* if node type doesn't match, skip */ + if (le16toh(chfdn.type) != CHFS_NODETYPE_DIRENT) + continue; + + /* if crc doesn't match, skip */ + if (le32toh(chfdn.name_crc) != name_crc) + continue; + + /* if length of name different, or this is an another deletion + * dirent, skip + */ + if (chfdn.nsize != name_len || !le64toh(chfdn.vno)) + continue; + + /* check actual name */ + if (memcmp(chfdn.name, fd->name, name_len)) + continue; + +// kmem_free(chfdn, nref_len); + + chfs_mark_node_obsolete(chmp, fd->nref); + return chfs_write_flash_dirent(chmp, + parent, NULL, fd, fd->vno, ALLOC_GC); + } + +// kmem_free(chfdn, nref_len); + + TAILQ_REMOVE(&parent->dents, fd, fds); + chfs_free_dirent(fd); + return 0; +} + +int +chfs_gcollect_dnode(struct chfs_mount *chmp, + struct chfs_eraseblock *orig_cheb, struct chfs_inode *ip, + struct chfs_full_dnode *fn, uint32_t orig_start, uint32_t orig_end) +{ + struct chfs_node_ref *nref, *prev; + struct chfs_full_dnode *newfn; + struct chfs_flash_data_node *fdnode; + int ret = 0, retries = 0; + uint32_t totlen; + char *data = NULL; + struct iovec vec; + size_t retlen; + dbg_gc("gcollect_dnode\n"); + + //uint32_t used_size; + +/* TODO GC merging frags, should we use it? + + uint32_t start, end; + + start = orig_start; + end = orig_end; + + if (chmp->chm_nr_free_blocks + chmp->chm_nr_erasable_blocks > chmp->chm_resv_blocks_gcmerge) { + struct chfs_node_frag *frag; + uint32_t min, max; + + min = start & (PAGE_CACHE_SIZE-1); + max = min + PAGE_CACHE_SIZE; + + frag = (struct chfs_node_frag *)rb_tree_find_node_leq(&ip->i_chfs_ext.fragtree, &start); + KASSERT(frag->ofs == start); + + while ((frag = frag_prev(&ip->i_chfs_ext.fragtree, frag)) && frag->ofs >= min) { + if (frag->ofs > min) { + start = frag->ofs; + continue; + } + + if (!frag->node || !frag->node->nref) { + break; + } else { + struct chfs_node_ref *nref = frag->node->nref; + struct chfs_eraseblock *cheb; + + cheb = &chmp->chm_blocks[nref->nref_lnr]; + + if (cheb == chmp->chm_gcblock) + start = frag->ofs; + + //TODO is this a clean block? 
+ + start = frag->ofs; + break; + } + } + + end--; + frag = (struct chfs_node_frag *)rb_tree_find_node_leq(&ip->i_chfs_ext.fragtree, &(end)); + + while ((frag = frag_next(&ip->i_chfs_ext.fragtree, frag)) && (frag->ofs + frag->size <= max)) { + if (frag->ofs + frag->size < max) { + end = frag->ofs + frag->size; + continue; + } + + if (!frag->node || !frag->node->nref) { + break; + } else { + struct chfs_node_ref *nref = frag->node->nref; + struct chfs_eraseblock *cheb; + + cheb = &chmp->chm_blocks[nref->nref_lnr]; + + if (cheb == chmp->chm_gcblock) + end = frag->ofs + frag->size; + + //TODO is this a clean block? + + end = frag->ofs + frag->size; + break; + } + } + + KASSERT(end <= + frag_last(&ip->i_chfs_ext.fragtree)->ofs + + frag_last(&ip->i_chfs_ext.fragtree)->size); + KASSERT(end >= orig_end); + KASSERT(start <= orig_start); + } +*/ + KASSERT(orig_cheb->lnr == fn->nref->nref_lnr); + totlen = chfs_nref_len(chmp, orig_cheb, fn->nref); + data = kmem_alloc(totlen, KM_SLEEP); + + ret = chfs_read_leb(chmp, fn->nref->nref_lnr, data, fn->nref->nref_offset, + totlen, &retlen); + + fdnode = (struct chfs_flash_data_node *)data; + fdnode->version = htole64(++ip->chvc->highest_version); + fdnode->node_crc = htole32(crc32(0, (uint8_t *)fdnode, + sizeof(*fdnode) - 4)); + + vec.iov_base = (void *)data; + vec.iov_len = totlen; + +retry: + ret = chfs_reserve_space_gc(chmp, totlen); + if (ret) + goto out; + + nref = chfs_alloc_node_ref(chmp->chm_nextblock); + if (!nref) { + ret = ENOMEM; + goto out; + } + + mutex_enter(&chmp->chm_lock_sizes); + + nref->nref_offset = chmp->chm_ebh->eb_size - chmp->chm_nextblock->free_size; + KASSERT(nref->nref_offset % 4 == 0); + chfs_change_size_free(chmp, chmp->chm_nextblock, -totlen); + + ret = chfs_write_wbuf(chmp, &vec, 1, nref->nref_offset, &retlen); + if (ret || retlen != totlen) { + chfs_err("error while writing out to the media\n"); + chfs_err("err: %d | size: %d | retlen : %zu\n", + ret, totlen, retlen); + chfs_change_size_dirty(chmp, chmp->chm_nextblock, totlen); + if (retries) { + ret = EIO; + mutex_exit(&chmp->chm_lock_sizes); + goto out; + } + + retries++; + mutex_exit(&chmp->chm_lock_sizes); + goto retry; + } + + dbg_gc("new nref lnr: %u - offset: %u\n", nref->nref_lnr, nref->nref_offset); + + chfs_change_size_used(chmp, &chmp->chm_blocks[nref->nref_lnr], totlen); + mutex_exit(&chmp->chm_lock_sizes); + KASSERT(chmp->chm_blocks[nref->nref_lnr].used_size <= chmp->chm_ebh->eb_size); + + newfn = chfs_alloc_full_dnode(); + newfn->nref = nref; + newfn->ofs = fn->ofs; + newfn->size = fn->size; + newfn->frags = fn->frags; + + //TODO should we remove fd from dnode list? + + prev = ip->chvc->dnode; + if (prev == fn->nref) { + ip->chvc->dnode = prev->nref_next; + prev = NULL; + } + while (prev) { + if (prev->nref_next == fn->nref) { + prev->nref_next = fn->nref->nref_next; + break; + } + prev = prev->nref_next; + } + + chfs_add_full_dnode_to_inode(chmp, ip, newfn); + chfs_add_node_to_list(chmp, + ip->chvc, newfn->nref, &ip->chvc->dnode); + +out: + kmem_free(data, totlen); + return ret; +} diff --git a/sys/ufs/chfs/chfs_ihash.c b/sys/ufs/chfs/chfs_ihash.c new file mode 100644 index 000000000..b16b00c6a --- /dev/null +++ b/sys/ufs/chfs/chfs_ihash.c @@ -0,0 +1,220 @@ +/* $NetBSD: chfs_ihash.c,v 1.1 2011/11/24 15:51:31 ahoka Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * All rights reserved. 
+ * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "chfs.h" +/* + * Structures associated with inode cacheing. + */ +static LIST_HEAD(ihashhead, chfs_inode) *chfs_ihashtbl; +static u_long chfs_ihash; /* size of hash table - 1 */ +#define INOHASH(device, inum) (((device) + (inum)) & chfs_ihash) + +kmutex_t chfs_ihash_lock; +kmutex_t chfs_hashlock; + +/* + * Initialize inode hash table. + */ +void +chfs_ihashinit(void) +{ + dbg("initing\n"); + + mutex_init(&chfs_hashlock, MUTEX_DEFAULT, IPL_NONE); + mutex_init(&chfs_ihash_lock, MUTEX_DEFAULT, IPL_NONE); + chfs_ihashtbl = hashinit(desiredvnodes, + HASH_LIST, true, &chfs_ihash); +} + +/* + * Reinitialize inode hash table. + */ + +void +chfs_ihashreinit(void) +{ + struct chfs_inode *ip; + struct ihashhead *oldhash, *hash; + u_long oldmask, mask, val; + int i; + + dbg("reiniting\n"); + + hash = hashinit(desiredvnodes, HASH_LIST, true, &mask); + mutex_enter(&chfs_ihash_lock); + oldhash = chfs_ihashtbl; + oldmask = chfs_ihash; + chfs_ihashtbl = hash; + chfs_ihash = mask; + for (i = 0; i <= oldmask; i++) { + while ((ip = LIST_FIRST(&oldhash[i])) != NULL) { + LIST_REMOVE(ip, hash_entry); + val = INOHASH(ip->dev, ip->ino); + LIST_INSERT_HEAD(&hash[val], ip, hash_entry); + } + } + mutex_exit(&chfs_ihash_lock); + hashdone(oldhash, HASH_LIST, oldmask); +} + +/* + * Free inode hash table. + */ +void +chfs_ihashdone(void) +{ + dbg("destroying\n"); + + hashdone(chfs_ihashtbl, HASH_LIST, chfs_ihash); + mutex_destroy(&chfs_hashlock); + mutex_destroy(&chfs_ihash_lock); +} + +/* + * Use the device/inum pair to find the incore inode, and return a pointer + * to it. If it is in core, return it, even if it is locked. 
+ */ +struct vnode * +chfs_ihashlookup(dev_t dev, ino_t inum) +{ + struct chfs_inode *ip; + struct ihashhead *ipp; + + dbg("dev: %ju, inum: %ju\n", (uintmax_t )dev, (uintmax_t )inum); + + KASSERT(mutex_owned(&chfs_ihash_lock)); + + ipp = &chfs_ihashtbl[INOHASH(dev, inum)]; + LIST_FOREACH(ip, ipp, hash_entry) { + if (inum == ip->ino && dev == ip->dev) { + break; + } + } + + if (ip) { + return (ITOV(ip)); + } + + return (NULLVP); +} + +/* + * Use the device/inum pair to find the incore inode, and return a pointer + * to it. If it is in core, but locked, wait for it. + */ +struct vnode * +chfs_ihashget(dev_t dev, ino_t inum, int flags) +{ + struct ihashhead *ipp; + struct chfs_inode *ip; + struct vnode *vp; + + dbg("search for ino\n"); + +loop: + mutex_enter(&chfs_ihash_lock); + ipp = &chfs_ihashtbl[INOHASH(dev, inum)]; + dbg("ipp: %p, chfs_ihashtbl: %p, ihash: %lu\n", + ipp, chfs_ihashtbl, chfs_ihash); + LIST_FOREACH(ip, ipp, hash_entry) { + dbg("ip: %p\n", ip); + if (inum == ip->ino && dev == ip->dev) { +// printf("chfs_ihashget: found inode: %p\n", ip); + vp = ITOV(ip); + KASSERT(vp != NULL); + //dbg("found\n"); + if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) { + //dbg("wait for #%llu\n", ip->ino); + mutex_exit(&chfs_ihash_lock); + goto loop; + } + /* + if (VOP_ISLOCKED(vp)) + dbg("locked\n"); + else + dbg("isn't locked\n"); + */ + if (flags == 0) { + //dbg("no flags\n"); + mutex_exit(&chfs_ihash_lock); + } else { + //dbg("vget\n"); + mutex_enter(vp->v_interlock); + mutex_exit(&chfs_ihash_lock); + if (vget(vp, flags)) { + goto loop; + } + //dbg("got it\n"); + } + //dbg("return\n"); + return (vp); + } + } + //dbg("not found\n"); + mutex_exit(&chfs_ihash_lock); + return (NULL); +} + +/* + * Insert the inode into the hash table, and return it locked. + */ +void +chfs_ihashins(struct chfs_inode *ip) +{ + struct ihashhead *ipp; + + dbg("ip: %p\n", ip); + + KASSERT(mutex_owned(&chfs_hashlock)); + + /* lock the inode, then put it on the appropriate hash list */ + VOP_LOCK(ITOV(ip), LK_EXCLUSIVE); + + mutex_enter(&chfs_ihash_lock); + ipp = &chfs_ihashtbl[INOHASH(ip->dev, ip->ino)]; + LIST_INSERT_HEAD(ipp, ip, hash_entry); + mutex_exit(&chfs_ihash_lock); +} + +/* + * Remove the inode from the hash table. + */ +void +chfs_ihashrem(struct chfs_inode *ip) +{ + dbg("ip: %p\n", ip); + + mutex_enter(&chfs_ihash_lock); + LIST_REMOVE(ip, hash_entry); + mutex_exit(&chfs_ihash_lock); +} + diff --git a/include/ufs/chfs/chfs_inode.h b/sys/ufs/chfs/chfs_inode.h similarity index 100% rename from include/ufs/chfs/chfs_inode.h rename to sys/ufs/chfs/chfs_inode.h diff --git a/sys/ufs/chfs/chfs_malloc.c b/sys/ufs/chfs/chfs_malloc.c new file mode 100644 index 000000000..3138acc00 --- /dev/null +++ b/sys/ufs/chfs/chfs_malloc.c @@ -0,0 +1,396 @@ +/* $NetBSD: chfs_malloc.c,v 1.1 2011/11/24 15:51:31 ahoka Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * Copyright (C) 2010 Tamas Toth + * Copyright (C) 2010 Adam Hoka + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "chfs.h" +#include + +pool_cache_t chfs_vnode_cache; +pool_cache_t chfs_nrefs_cache; +pool_cache_t chfs_flash_vnode_cache; +pool_cache_t chfs_flash_dirent_cache; +pool_cache_t chfs_flash_dnode_cache; +pool_cache_t chfs_node_frag_cache; +pool_cache_t chfs_tmp_dnode_cache; +pool_cache_t chfs_tmp_dnode_info_cache; + +int +chfs_alloc_pool_caches() +{ + chfs_vnode_cache = pool_cache_init( + sizeof(struct chfs_vnode_cache), + 0, 0, 0, "chfs_vnode_cache", NULL, IPL_NONE, NULL, NULL, + NULL); + if (!chfs_vnode_cache) + goto err_vnode; + + chfs_nrefs_cache = pool_cache_init( + (REFS_BLOCK_LEN + 1) * sizeof(struct chfs_node_ref), 0, 0, + 0, "chfs_nrefs_pool", NULL, IPL_NONE, NULL, NULL, NULL); + if (!chfs_nrefs_cache) + goto err_nrefs; + + chfs_flash_vnode_cache = pool_cache_init( + sizeof(struct chfs_flash_vnode), 0, 0, 0, + "chfs_flash_vnode_pool", NULL, IPL_NONE, NULL, NULL, NULL); + if (!chfs_flash_vnode_cache) + goto err_flash_vnode; + + chfs_flash_dirent_cache = pool_cache_init( + sizeof(struct chfs_flash_dirent_node), 0, 0, 0, + "chfs_flash_dirent_pool", NULL, IPL_NONE, NULL, NULL, NULL); + if (!chfs_flash_dirent_cache) + goto err_flash_dirent; + + chfs_flash_dnode_cache = pool_cache_init( + sizeof(struct chfs_flash_data_node), 0, 0, 0, + "chfs_flash_dnode_pool", NULL, IPL_NONE, NULL, NULL, NULL); + if (!chfs_flash_dnode_cache) + goto err_flash_dnode; + + chfs_node_frag_cache = pool_cache_init( + sizeof(struct chfs_node_frag), 0, 0, 0, + "chfs_node_frag_pool", NULL, IPL_NONE, NULL, NULL, NULL); + if (!chfs_node_frag_cache) + goto err_node_frag; + + chfs_tmp_dnode_cache = pool_cache_init( + sizeof(struct chfs_tmp_dnode), 0, 0, 0, + "chfs_tmp_dnode_pool", NULL, IPL_NONE, NULL, NULL, NULL); + if (!chfs_tmp_dnode_cache) + goto err_tmp_dnode; + + chfs_tmp_dnode_info_cache = pool_cache_init( + sizeof(struct chfs_tmp_dnode_info), 0, 0, 0, + "chfs_tmp_dnode_info_pool", NULL, IPL_NONE, NULL, NULL, NULL); + if (!chfs_tmp_dnode_info_cache) + goto err_tmp_dnode_info; + + return 0; + +err_tmp_dnode_info: + pool_cache_destroy(chfs_tmp_dnode_cache); +err_tmp_dnode: + pool_cache_destroy(chfs_node_frag_cache); +err_node_frag: + pool_cache_destroy(chfs_flash_dnode_cache); +err_flash_dnode: + pool_cache_destroy(chfs_flash_dirent_cache); +err_flash_dirent: + pool_cache_destroy(chfs_flash_vnode_cache); +err_flash_vnode: + pool_cache_destroy(chfs_nrefs_cache); +err_nrefs: + pool_cache_destroy(chfs_vnode_cache); +err_vnode: + + return ENOMEM; +} + +void +chfs_destroy_pool_caches() +{ + if (chfs_vnode_cache) + pool_cache_destroy(chfs_vnode_cache); + + if 
(chfs_nrefs_cache) + pool_cache_destroy(chfs_nrefs_cache); + + if (chfs_flash_vnode_cache) + pool_cache_destroy(chfs_flash_vnode_cache); + + if (chfs_flash_dirent_cache) + pool_cache_destroy(chfs_flash_dirent_cache); + + if (chfs_flash_dnode_cache) + pool_cache_destroy(chfs_flash_dnode_cache); + + if (chfs_node_frag_cache) + pool_cache_destroy(chfs_node_frag_cache); + + if (chfs_tmp_dnode_cache) + pool_cache_destroy(chfs_tmp_dnode_cache); + + if (chfs_tmp_dnode_info_cache) + pool_cache_destroy(chfs_tmp_dnode_info_cache); +} + +struct chfs_vnode_cache * +chfs_vnode_cache_alloc(ino_t vno) +{ + struct chfs_vnode_cache* vc; + vc = pool_cache_get(chfs_vnode_cache, PR_WAITOK); + + memset(vc, 0, sizeof(*vc)); + vc->vno = vno; + vc->v = (void *)vc; + vc->dirents = (void *)vc; + vc->dnode = (void *)vc; + TAILQ_INIT(&vc->scan_dirents); + vc->highest_version = 0; + + return vc; +} + +void +chfs_vnode_cache_free(struct chfs_vnode_cache *vc) +{ + //kmem_free(vc->vno_version, sizeof(uint64_t)); + pool_cache_put(chfs_vnode_cache, vc); +} + +/** + * chfs_alloc_refblock - allocating a refblock + * + * Returns a pointer to the first element of the block. + * + * We do not allocate just one node ref; instead we allocate REFS_BLOCK_LEN + * node refs, and the last element is a pointer to the next block. + * We do this because we need a chain of nodes ordered by their + * physical address. + * + */ +struct chfs_node_ref* +chfs_alloc_refblock(void) +{ + int i; + struct chfs_node_ref *nref; + nref = pool_cache_get(chfs_nrefs_cache, PR_WAITOK); + + for (i = 0; i < REFS_BLOCK_LEN; i++) { + nref[i].nref_lnr = REF_EMPTY_NODE; + nref[i].nref_next = NULL; + } + i = REFS_BLOCK_LEN; + nref[i].nref_lnr = REF_LINK_TO_NEXT; + nref[i].nref_next = NULL; + + return nref; +} + +/** + * chfs_free_refblock - freeing a refblock + */ +void +chfs_free_refblock(struct chfs_node_ref *nref) +{ + pool_cache_put(chfs_nrefs_cache, nref); +} + +/** + * chfs_alloc_node_ref - allocating a node ref from a refblock + * @cheb: eraseblock information structure + * + * Allocates a node ref from a refblock; if there isn't any free element in the + * block, a new block is allocated and linked to the current one.
+ */ +struct chfs_node_ref* +chfs_alloc_node_ref(struct chfs_eraseblock *cheb) +{ + struct chfs_node_ref *nref, *new, *old; + old = cheb->last_node; + nref = cheb->last_node; + + if (!nref) { + //There haven't been any nref allocated for this block yet + nref = chfs_alloc_refblock(); + + cheb->first_node = nref; + cheb->last_node = nref; + nref->nref_lnr = cheb->lnr; + KASSERT(cheb->lnr == nref->nref_lnr); + + return nref; + } + + nref++; + if (nref->nref_lnr == REF_LINK_TO_NEXT) { + new = chfs_alloc_refblock(); + nref->nref_next = new; + nref = new; + } + + cheb->last_node = nref; + nref->nref_lnr = cheb->lnr; + + KASSERT(old->nref_lnr == nref->nref_lnr && + nref->nref_lnr == cheb->lnr); + + return nref; +} + +/** + * chfs_free_node_refs - freeing an eraseblock's node refs + * @cheb: eraseblock information structure + */ +void +chfs_free_node_refs(struct chfs_eraseblock *cheb) +{ + struct chfs_node_ref *nref, *block; + + block = nref = cheb->first_node; + + while (nref) { + if (nref->nref_lnr == REF_LINK_TO_NEXT) { + nref = nref->nref_next; + chfs_free_refblock(block); + block = nref; + continue; + } + nref++; + } +} + +struct chfs_dirent* +chfs_alloc_dirent(int namesize) +{ + struct chfs_dirent *ret; + size_t size = sizeof(struct chfs_dirent) + namesize; + + ret = kmem_alloc(size, KM_SLEEP); + //ret->alloc_size = size; + + return ret; +} + +void +chfs_free_dirent(struct chfs_dirent *dirent) +{ + //size_t size = dirent->alloc_size; + size_t size = sizeof(struct chfs_dirent) + dirent->nsize + 1; + + kmem_free(dirent, size); +} + +struct chfs_full_dnode* +chfs_alloc_full_dnode() +{ + struct chfs_full_dnode *ret; + ret = kmem_alloc(sizeof(struct chfs_full_dnode), KM_SLEEP); + return ret; +} + +void +chfs_free_full_dnode(struct chfs_full_dnode *fd) +{ + kmem_free(fd,(sizeof(struct chfs_full_dnode))); +} + +struct chfs_flash_vnode* +chfs_alloc_flash_vnode() +{ + struct chfs_flash_vnode *ret; + ret = pool_cache_get(chfs_flash_vnode_cache, 0); + return ret; +} + +void +chfs_free_flash_vnode(struct chfs_flash_vnode *fvnode) +{ + pool_cache_put(chfs_flash_vnode_cache, fvnode); +} + +struct chfs_flash_dirent_node* +chfs_alloc_flash_dirent() +{ + struct chfs_flash_dirent_node *ret; + ret = pool_cache_get(chfs_flash_dirent_cache, 0); + return ret; +} + +void +chfs_free_flash_dirent(struct chfs_flash_dirent_node *fdnode) +{ + pool_cache_put(chfs_flash_dirent_cache, fdnode); +} + +struct chfs_flash_data_node* +chfs_alloc_flash_dnode() +{ + struct chfs_flash_data_node *ret; + ret = pool_cache_get(chfs_flash_dnode_cache, 0); + return ret; +} + +void +chfs_free_flash_dnode(struct chfs_flash_data_node *fdnode) +{ + pool_cache_put(chfs_flash_dnode_cache, fdnode); +} + + +struct chfs_node_frag* +chfs_alloc_node_frag() +{ + struct chfs_node_frag *ret; + ret = pool_cache_get(chfs_node_frag_cache, 0); + return ret; + +} + +void +chfs_free_node_frag(struct chfs_node_frag *frag) +{ + pool_cache_put(chfs_node_frag_cache, frag); +} + +struct chfs_tmp_dnode * +chfs_alloc_tmp_dnode() +{ + struct chfs_tmp_dnode *ret; + ret = pool_cache_get(chfs_tmp_dnode_cache, 0); + ret->next = NULL; + return ret; +} + +void +chfs_free_tmp_dnode(struct chfs_tmp_dnode *td) +{ + pool_cache_put(chfs_tmp_dnode_cache, td); +} + +struct chfs_tmp_dnode_info * +chfs_alloc_tmp_dnode_info() +{ + struct chfs_tmp_dnode_info *ret; + ret = pool_cache_get(chfs_tmp_dnode_info_cache, 0); + ret->tmpnode = NULL; + return ret; +} + +void +chfs_free_tmp_dnode_info(struct chfs_tmp_dnode_info *di) +{ + pool_cache_put(chfs_tmp_dnode_info_cache, di); +} + 
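+/*
+ * Illustrative sketch (hypothetical helper, not part of the change above):
+ * how a refblock chain produced by chfs_alloc_refblock() is walked, assuming
+ * only the declarations already pulled in from "chfs.h".  Slots holding a
+ * real ref carry an eraseblock lnr, the trailing REF_LINK_TO_NEXT slot points
+ * at the next refblock, and REF_EMPTY_NODE marks the first unused slot.
+ * Kept under #if 0 so it is never compiled.
+ */
+#if 0
+static void
+chfs_refblock_walk_sketch(struct chfs_node_ref *block)
+{
+	struct chfs_node_ref *nref = block;
+
+	while (nref) {
+		if (nref->nref_lnr == REF_LINK_TO_NEXT) {
+			/* follow the link slot to the next refblock */
+			nref = nref->nref_next;
+			continue;
+		}
+		if (nref->nref_lnr == REF_EMPTY_NODE)
+			break;	/* past the last allocated ref */
+		/* ... process nref here ... */
+		nref++;
+	}
+}
+#endif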
diff --git a/sys/ufs/chfs/chfs_nodeops.c b/sys/ufs/chfs/chfs_nodeops.c new file mode 100644 index 000000000..bf761dd66 --- /dev/null +++ b/sys/ufs/chfs/chfs_nodeops.c @@ -0,0 +1,570 @@ +/* $NetBSD: chfs_nodeops.c,v 1.1 2011/11/24 15:51:31 ahoka Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * Copyright (C) 2010 David Tengeri + * Copyright (C) 2010 Tamas Toth + * Copyright (C) 2010 Adam Hoka + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "chfs.h" + +/** + * chfs_update_eb_dirty - updates dirty and free space, first and + * last node references + * @sbi: CHFS main descriptor structure + * @cheb: eraseblock to update + * @size: increase dirty space size with this + * Returns zero in case of success, %1 in case of fail. + */ +int +chfs_update_eb_dirty(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb, uint32_t size) +{ + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + KASSERT(!mutex_owned(&chmp->chm_lock_sizes)); + + if (!size) + return 0; + + if (size > cheb->free_size) { + chfs_err("free_size (%d) is less then dirty space (%d) " + "on block (%d)\n", cheb->free_size, size, cheb->lnr); + return 1; + } + mutex_enter(&chmp->chm_lock_sizes); + //dbg("BEFORE: free_size: %d\n", cheb->free_size); + chfs_change_size_free(chmp, cheb, -size); + chfs_change_size_dirty(chmp, cheb, size); + //dbg(" AFTER: free_size: %d\n", cheb->free_size); + mutex_exit(&chmp->chm_lock_sizes); + return 0; +} + +/** + * chfs_add_node_to_list - adds a data node ref to vnode cache's dnode list + * @sbi: super block informations + * @new: node ref to insert + * @list: head of the list + * This function inserts a data node ref to the list of vnode cache. + * The list is sorted by data node's lnr and offset. 
+ */ +void +chfs_add_node_to_list(struct chfs_mount *chmp, + struct chfs_vnode_cache *vc, + struct chfs_node_ref *new, struct chfs_node_ref **list) +{ + struct chfs_node_ref *nextref = *list; + struct chfs_node_ref *prevref = NULL; + + while (nextref && nextref != (struct chfs_node_ref *)vc && + (nextref->nref_lnr <= new->nref_lnr)) { + if (nextref->nref_lnr == new->nref_lnr) { + while (nextref && nextref != + (struct chfs_node_ref *)vc && + (CHFS_GET_OFS(nextref->nref_offset) < + CHFS_GET_OFS(new->nref_offset))) { + prevref = nextref; + nextref = nextref->nref_next; + } + break; + } + prevref = nextref; + nextref = nextref->nref_next; + } + + if (nextref && nextref != (struct chfs_node_ref *)vc && + nextref->nref_lnr == new->nref_lnr && + CHFS_GET_OFS(nextref->nref_offset) == + CHFS_GET_OFS(new->nref_offset)) { + new->nref_next = nextref->nref_next; + } else { + new->nref_next = nextref; + } + + if (prevref) { + prevref->nref_next = new; + } else { + *list = new; + } +} + +void +chfs_add_fd_to_inode(struct chfs_mount *chmp, + struct chfs_inode *parent, struct chfs_dirent *new) +{ +// struct chfs_dirent **prev = &parent->dents; + struct chfs_dirent *fd, *tmpfd; + + if (new->version > parent->chvc->highest_version) { + parent->chvc->highest_version = new->version; + } + + //mutex_enter(&parent->inode_lock); + TAILQ_FOREACH_SAFE(fd, &parent->dents, fds, tmpfd) { + if (fd->nhash > new->nhash) { + /* insert new before fd */ + TAILQ_INSERT_BEFORE(fd, new, fds); + return; + } else if (fd->nhash == new->nhash && + !strcmp(fd->name, new->name)) { + if (new->version > fd->version) { +// new->next = fd->next; + /* replace fd with new */ + TAILQ_INSERT_BEFORE(fd, new, fds); + TAILQ_REMOVE(&parent->dents, fd, fds); + if (fd->nref) { + chfs_mark_node_obsolete(chmp, + fd->nref); + } + chfs_free_dirent(fd); +// *prev = new;//XXX + } else { + chfs_mark_node_obsolete(chmp, new->nref); + chfs_free_dirent(new); + } + return; + } + } + /* if we couldnt fit it elsewhere, lets add to the end */ + /* FIXME insert tail or insert head? 
*/ + TAILQ_INSERT_HEAD(&parent->dents, new, fds); + //mutex_exit(&parent->inode_lock); +#if 0 + while ((*prev) && (*prev)->nhash <= new->nhash) { + if ((*prev)->nhash == new->nhash && + !strcmp((*prev)->name, new->name)) { + if (new->version > (*prev)->version) { + new->next = (*prev)->next; + if ((*prev)->nref) { + chfs_mark_node_obsolete(chmp, + (*prev)->nref); + } + chfs_free_dirent(*prev); + *prev = new; + } else { + chfs_mark_node_obsolete(chmp, new->nref); + chfs_free_dirent(new); + } + return; + } + prev = &((*prev)->next); + } + + new->next = *prev; + *prev = new; +#endif +} + +void +chfs_add_vnode_ref_to_vc(struct chfs_mount *chmp, + struct chfs_vnode_cache *vc, struct chfs_node_ref *new) +{ + if ((struct chfs_vnode_cache*)(vc->v) != vc) { + chfs_mark_node_obsolete(chmp, vc->v); + new->nref_next = vc->v->nref_next; + } else { + new->nref_next = vc->v; + } + vc->v = new; +} + +struct chfs_node_ref * +chfs_nref_next(struct chfs_node_ref *nref) +{ +// dbg("check nref: %u - %u\n", nref->nref_lnr, nref->nref_offset); + nref++; +// dbg("next nref: %u - %u\n", nref->nref_lnr, nref->nref_offset); + if (nref->nref_lnr == REF_LINK_TO_NEXT) { + //End of chain + if (!nref->nref_next) + return NULL; + + nref = nref->nref_next; + } + //end of chain + if (nref->nref_lnr == REF_EMPTY_NODE) + return NULL; + + return nref; +} + +int +chfs_nref_len(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb, struct chfs_node_ref *nref) +{ + struct chfs_node_ref *next; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + if (!cheb) + cheb = &chmp->chm_blocks[nref->nref_lnr]; + + next = chfs_nref_next(nref); + + if (!next) { + //dbg("next null\n"); + return chmp->chm_ebh->eb_size - cheb->free_size - + CHFS_GET_OFS(nref->nref_offset); + } + //dbg("size: %d\n", CHFS_GET_OFS(next->nref_offset) - CHFS_GET_OFS(nref->nref_offset)); + return CHFS_GET_OFS(next->nref_offset) - + CHFS_GET_OFS(nref->nref_offset); +} + +/** + * chfs_mark_node_obsolete - marks a node obsolete + */ +void +chfs_mark_node_obsolete(struct chfs_mount *chmp, + struct chfs_node_ref *nref) +{ + int len; + struct chfs_eraseblock *cheb; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + KASSERT(!CHFS_REF_OBSOLETE(nref)); + + KASSERT(nref->nref_lnr <= chmp->chm_ebh->peb_nr); + cheb = &chmp->chm_blocks[nref->nref_lnr]; + +#ifdef DIAGNOSTIC + if (cheb->used_size + cheb->free_size + cheb->dirty_size + + cheb->unchecked_size + cheb->wasted_size != chmp->chm_ebh->eb_size) { + dbg("eraseblock leak detected!\nused: %u\nfree: %u\n" + "dirty: %u\nunchecked: %u\nwasted: %u\ntotal: %u\nshould be: %zu\n", + cheb->used_size, cheb->free_size, cheb->dirty_size, + cheb->unchecked_size, cheb->wasted_size, cheb->used_size + cheb->free_size + + cheb->dirty_size + cheb->unchecked_size + cheb->wasted_size, + chmp->chm_ebh->eb_size); + } +#endif + + len = chfs_nref_len(chmp, cheb, nref); + //dbg("len: %u\n", len); + //dbg("1. used: %u\n", cheb->used_size); + + mutex_enter(&chmp->chm_lock_sizes); + + if (CHFS_REF_FLAGS(nref) == CHFS_UNCHECKED_NODE_MASK) { + //dbg("UNCHECKED mark an unchecked node\n"); + chfs_change_size_unchecked(chmp, cheb, -len); + //dbg("unchecked: %u\n", chmp->chm_unchecked_size); + } else { + chfs_change_size_used(chmp, cheb, -len); + + //dbg("2. 
used: %u\n", cheb->used_size); + KASSERT(cheb->used_size <= chmp->chm_ebh->eb_size); + } + chfs_change_size_dirty(chmp, cheb, len); + +#ifdef DIAGNOSTIC + if (cheb->used_size + cheb->free_size + cheb->dirty_size + + cheb->unchecked_size + cheb->wasted_size != chmp->chm_ebh->eb_size) { + panic("eraseblock leak detected!\nused: %u\nfree: %u\n" + "dirty: %u\nunchecked: %u\nwasted: %u\ntotal: %u\nshould be: %zu\n", + cheb->used_size, cheb->free_size, cheb->dirty_size, + cheb->unchecked_size, cheb->wasted_size, cheb->used_size + cheb->free_size + + cheb->dirty_size + cheb->unchecked_size + cheb->wasted_size, + chmp->chm_ebh->eb_size); + } +#endif + nref->nref_offset = CHFS_GET_OFS(nref->nref_offset) | + CHFS_OBSOLETE_NODE_MASK; + + if (chmp->chm_flags & CHFS_MP_FLAG_SCANNING) { + /*Scan is in progress, do nothing now*/ + mutex_exit(&chmp->chm_lock_sizes); + return; + } + + if (cheb == chmp->chm_nextblock) { + dbg("Not moving nextblock to dirty/erase_pending list\n"); + } else if (!cheb->used_size && !cheb->unchecked_size) { + if (cheb == chmp->chm_gcblock) { + dbg("gcblock is completely dirtied\n"); + chmp->chm_gcblock = NULL; + } else { + //remove from a tailq, but we don't know which tailq contains this cheb + //so we remove it from the dirty list now + //TAILQ_REMOVE(&chmp->chm_dirty_queue, cheb, queue); + int removed = 0; + struct chfs_eraseblock *eb, *tmpeb; + //XXX ugly code + TAILQ_FOREACH_SAFE(eb, &chmp->chm_free_queue, queue, tmpeb) { + if (eb == cheb) { + TAILQ_REMOVE(&chmp->chm_free_queue, cheb, queue); + removed = 1; + break; + } + } + if (removed == 0) { + TAILQ_FOREACH_SAFE(eb, &chmp->chm_dirty_queue, queue, tmpeb) { + if (eb == cheb) { + TAILQ_REMOVE(&chmp->chm_dirty_queue, cheb, queue); + removed = 1; + break; + } + } + } + if (removed == 0) { + TAILQ_FOREACH_SAFE(eb, &chmp->chm_very_dirty_queue, queue, tmpeb) { + if (eb == cheb) { + TAILQ_REMOVE(&chmp->chm_very_dirty_queue, cheb, queue); + removed = 1; + break; + } + } + } + if (removed == 0) { + TAILQ_FOREACH_SAFE(eb, &chmp->chm_clean_queue, queue, tmpeb) { + if (eb == cheb) { + TAILQ_REMOVE(&chmp->chm_clean_queue, cheb, queue); + removed = 1; + break; + } + } + } + } + if (chmp->chm_wbuf_len) { + dbg("Adding block to erasable pending wbuf queue\n"); + TAILQ_INSERT_TAIL(&chmp->chm_erasable_pending_wbuf_queue, + cheb, queue); + } else { + TAILQ_INSERT_TAIL(&chmp->chm_erase_pending_queue, + cheb, queue); + chmp->chm_nr_erasable_blocks++; + } + chfs_remap_leb(chmp); + } else if (cheb == chmp->chm_gcblock) { + dbg("Not moving gcblock to dirty list\n"); + } else if (cheb->dirty_size > MAX_DIRTY_TO_CLEAN && + cheb->dirty_size - len <= MAX_DIRTY_TO_CLEAN) { + dbg("Freshly dirtied, remove it from clean queue and " + "add it to dirty\n"); + TAILQ_REMOVE(&chmp->chm_clean_queue, cheb, queue); + TAILQ_INSERT_TAIL(&chmp->chm_dirty_queue, cheb, queue); + } else if (VERY_DIRTY(chmp, cheb->dirty_size) && + !VERY_DIRTY(chmp, cheb->dirty_size - len)) { + dbg("Becomes now very dirty, remove it from dirty " + "queue and add it to very dirty\n"); + TAILQ_REMOVE(&chmp->chm_dirty_queue, cheb, queue); + TAILQ_INSERT_TAIL(&chmp->chm_very_dirty_queue, cheb, queue); + } else { + dbg("Leave cheb where it is\n"); + } + mutex_exit(&chmp->chm_lock_sizes); + return; +} + +/** + * chfs_close_eraseblock - close an eraseblock + * @chmp: chfs mount structure + * @cheb: eraseblock informations + * + * This function close the physical chain of the nodes on the eraseblock, + * convert its free size to dirty and add it to clean, dirty or very dirty list. 
+ */ +int +chfs_close_eraseblock(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb) +{ + uint32_t offset; + struct chfs_node_ref *nref; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + offset = chmp->chm_ebh->eb_size - cheb->free_size; + + // Close the chain + nref = chfs_alloc_node_ref(cheb); + if (!nref) + return ENOMEM; + + nref->nref_next = NULL; + nref->nref_offset = offset; + + // Mark space as dirty + chfs_update_eb_dirty(chmp, cheb, cheb->free_size); + + if (cheb->dirty_size < MAX_DIRTY_TO_CLEAN) { + TAILQ_INSERT_TAIL(&chmp->chm_clean_queue, cheb, queue); + } else if (VERY_DIRTY(chmp, cheb->dirty_size)) { + TAILQ_INSERT_TAIL(&chmp->chm_very_dirty_queue, cheb, queue); + } else { + TAILQ_INSERT_TAIL(&chmp->chm_dirty_queue, cheb, queue); + } + return 0; +} + +int +chfs_reserve_space_normal(struct chfs_mount *chmp, uint32_t size, int prio) +{ + int ret; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + mutex_enter(&chmp->chm_lock_sizes); + while (chmp->chm_nr_free_blocks + chmp->chm_nr_erasable_blocks < chmp->chm_resv_blocks_write) { + dbg("free: %d, erasable: %d, resv: %d\n", chmp->chm_nr_free_blocks, chmp->chm_nr_erasable_blocks, chmp->chm_resv_blocks_write); + uint32_t avail, dirty; + if (prio == ALLOC_DELETION && chmp->chm_nr_free_blocks + chmp->chm_nr_erasable_blocks >= chmp->chm_resv_blocks_deletion) + break; + + dirty = chmp->chm_dirty_size - chmp->chm_nr_erasable_blocks * chmp->chm_ebh->eb_size + chmp->chm_unchecked_size; + if (dirty < chmp->chm_nospc_dirty) { + dbg("dirty: %u < nospc_dirty: %u\n", dirty, chmp->chm_nospc_dirty); + ret = ENOSPC; + mutex_exit(&chmp->chm_lock_sizes); + goto out; + } + + avail = chmp->chm_free_size - (chmp->chm_resv_blocks_write * chmp->chm_ebh->eb_size); + if (size > avail) { + dbg("size: %u > avail: %u\n", size, avail); + ret = ENOSPC; + mutex_exit(&chmp->chm_lock_sizes); + goto out; + } + + mutex_exit(&chmp->chm_lock_sizes); + ret = chfs_gcollect_pass(chmp); + /* gcollect_pass exits chm_lock_mountfields */ + mutex_enter(&chmp->chm_lock_mountfields); + mutex_enter(&chmp->chm_lock_sizes); + + if (chmp->chm_nr_erasable_blocks || + !TAILQ_EMPTY(&chmp->chm_erasable_pending_wbuf_queue) || + ret == EAGAIN) { + ret = chfs_remap_leb(chmp); + } + + if (ret) { + mutex_exit(&chmp->chm_lock_sizes); + goto out; + } + } + + mutex_exit(&chmp->chm_lock_sizes); + ret = chfs_reserve_space(chmp, size); +out: + return ret; +} + + +int +chfs_reserve_space_gc(struct chfs_mount *chmp, uint32_t size) +{ + int ret; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + mutex_enter(&chmp->chm_lock_sizes); + chfs_remap_leb(chmp); + + if (size > chmp->chm_free_size) { + dbg("size: %u\n", size); + mutex_exit(&chmp->chm_lock_sizes); + return ENOSPC; + } + + mutex_exit(&chmp->chm_lock_sizes); + ret = chfs_reserve_space(chmp, size); + return ret; +} + +/** + * chfs_reserve_space - finds a block which free size is >= requested size + * @chmp: chfs mount point + * @size: requested size + * @len: reserved spaced will be returned in this variable; + * Returns zero in case of success, error code in case of fail. 
+ */ +int +chfs_reserve_space(struct chfs_mount *chmp, uint32_t size) +{ + //TODO define minimum reserved blocks, which is needed for writing + //TODO check we have enough free blocks to write + //TODO if no: need erase and GC + + int err; + struct chfs_eraseblock *cheb; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + KASSERT(!mutex_owned(&chmp->chm_lock_sizes)); + + cheb = chmp->chm_nextblock; + //if (cheb) + //dbg("cheb->free_size %u\n", cheb->free_size); + if (cheb && size > cheb->free_size) { + dbg("size: %u > free_size: %u\n", size, cheb->free_size); + /* + * There isn't enough space on this eraseblock, we mark this as + * dirty and close the physical chain of the node refs. + */ + //Write out pending data if any + if (chmp->chm_wbuf_len) { + chfs_flush_pending_wbuf(chmp); + //FIXME need goto restart here? + } + + while (chmp->chm_wbuf_ofs < chmp->chm_ebh->eb_size) { + dbg("wbuf ofs: %zu - eb_size: %zu\n", + chmp->chm_wbuf_ofs, chmp->chm_ebh->eb_size); + chfs_flush_pending_wbuf(chmp); + } + + if (!(chmp->chm_wbuf_ofs % chmp->chm_ebh->eb_size) && !chmp->chm_wbuf_len) + chmp->chm_wbuf_ofs = 0xffffffff; + + err = chfs_close_eraseblock(chmp, cheb); + if (err) + return err; + + cheb = NULL; + } + if (!cheb) { + //get a block for nextblock + if (TAILQ_EMPTY(&chmp->chm_free_queue)) { + // If this succeeds there will be a block on free_queue + dbg("cheb remap (free: %d)\n", chmp->chm_nr_free_blocks); + err = chfs_remap_leb(chmp); + if (err) + return err; + } + cheb = TAILQ_FIRST(&chmp->chm_free_queue); + TAILQ_REMOVE(&chmp->chm_free_queue, cheb, queue); + chmp->chm_nextblock = cheb; + chmp->chm_nr_free_blocks--; + } + + return 0; +} + diff --git a/sys/ufs/chfs/chfs_pool.c b/sys/ufs/chfs/chfs_pool.c new file mode 100644 index 000000000..6e25d17f2 --- /dev/null +++ b/sys/ufs/chfs/chfs_pool.c @@ -0,0 +1,211 @@ +/* $NetBSD: chfs_pool.c,v 1.1 2011/11/24 15:51:31 ahoka Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Pool allocator and convenience routines for chfs. 
+ */ + +#include + +#include +#include +#include + +#include + +#include "chfs.h" +//#include + +/* --------------------------------------------------------------------- */ + +void * chfs_pool_page_alloc(struct pool *, int); +void chfs_pool_page_free(struct pool *, void *); + +extern void* pool_page_alloc_nointr(struct pool *, int); +extern void pool_page_free_nointr(struct pool *, void *); + +/* --------------------------------------------------------------------- */ + +struct pool_allocator chfs_pool_allocator = { + .pa_alloc = chfs_pool_page_alloc, + .pa_free = chfs_pool_page_free, +}; + +/* --------------------------------------------------------------------- */ + +void +chfs_pool_init(struct chfs_pool *chpp, size_t size, const char *what, + struct chfs_mount *chmp) +{ + int cnt; + + cnt = snprintf(chpp->chp_name, sizeof(chpp->chp_name), + "%s_chfs_%p", what, chmp); + KASSERT(cnt < sizeof(chpp->chp_name)); + + pool_init(&chpp->chp_pool, size, 0, 0, 0, chpp->chp_name, + &chfs_pool_allocator, IPL_NONE); + chpp->chp_mount = chmp; +} + +/* --------------------------------------------------------------------- */ + +void +chfs_pool_destroy(struct chfs_pool *chpp) +{ + pool_destroy((struct pool *)chpp); +} + +/* --------------------------------------------------------------------- */ + +void * +chfs_pool_page_alloc(struct pool *pp, int flags) +{ + struct chfs_pool *chpp; + struct chfs_mount *chmp; + unsigned int pages; + void *page; + dbg("CHFS: pool_page_alloc()\n"); + + chpp = (struct chfs_pool *)pp; + chmp = chpp->chp_mount; + + pages = atomic_inc_uint_nv(&chmp->chm_pages_used); + if (pages >= CHFS_PAGES_MAX(chmp)) { + atomic_dec_uint(&chmp->chm_pages_used); + return NULL; + } + page = pool_page_alloc_nointr(pp, flags | PR_WAITOK); + if (page == NULL) { + atomic_dec_uint(&chmp->chm_pages_used); + } + + return page; +} + +/* --------------------------------------------------------------------- */ + +void +chfs_pool_page_free(struct pool *pp, void *v) +{ + struct chfs_pool *chpp; + struct chfs_mount *chmp; + dbg("CHFS: pool_page_free()\n"); + + chpp = (struct chfs_pool *)pp; + chmp = chpp->chp_mount; + + atomic_dec_uint(&chmp->chm_pages_used); + pool_page_free_nointr(pp, v); +} + +/* --------------------------------------------------------------------- */ + +void +chfs_str_pool_init(struct chfs_str_pool *chsp, struct chfs_mount *chmp) +{ + dbg("CHFS: str_pool_init()\n"); + + chfs_pool_init(&chsp->chsp_pool_16, 16, "str", chmp); + chfs_pool_init(&chsp->chsp_pool_32, 32, "str", chmp); + chfs_pool_init(&chsp->chsp_pool_64, 64, "str", chmp); + chfs_pool_init(&chsp->chsp_pool_128, 128, "str", chmp); + chfs_pool_init(&chsp->chsp_pool_256, 256, "str", chmp); + chfs_pool_init(&chsp->chsp_pool_512, 512, "str", chmp); + chfs_pool_init(&chsp->chsp_pool_1024, 1024, "str", chmp); +} + +/* --------------------------------------------------------------------- */ + +void +chfs_str_pool_destroy(struct chfs_str_pool *chsp) +{ + dbg("CHFS: str_pool_destroy()\n"); + + chfs_pool_destroy(&chsp->chsp_pool_16); + chfs_pool_destroy(&chsp->chsp_pool_32); + chfs_pool_destroy(&chsp->chsp_pool_64); + chfs_pool_destroy(&chsp->chsp_pool_128); + chfs_pool_destroy(&chsp->chsp_pool_256); + chfs_pool_destroy(&chsp->chsp_pool_512); + chfs_pool_destroy(&chsp->chsp_pool_1024); +} + +/* --------------------------------------------------------------------- */ + +char * +chfs_str_pool_get(struct chfs_str_pool *chsp, size_t len, int flags) +{ + struct chfs_pool *p; + dbg("CHFS: str_pool_get()\n"); + + KASSERT(len <= 1024); + + if 
(len <= 16) p = &chsp->chsp_pool_16; + else if (len <= 32) p = &chsp->chsp_pool_32; + else if (len <= 64) p = &chsp->chsp_pool_64; + else if (len <= 128) p = &chsp->chsp_pool_128; + else if (len <= 256) p = &chsp->chsp_pool_256; + else if (len <= 512) p = &chsp->chsp_pool_512; + else if (len <= 1024) p = &chsp->chsp_pool_1024; + else { + KASSERT(0); + p = NULL; /* Silence compiler warnings */ + } + + return (char *)CHFS_POOL_GET(p, flags); +} + +/* --------------------------------------------------------------------- */ + +void +chfs_str_pool_put(struct chfs_str_pool *chsp, char *str, size_t len) +{ + struct chfs_pool *p; + dbg("CHFS: str_pool_put()\n"); + + KASSERT(len <= 1024); + + if (len <= 16) p = &chsp->chsp_pool_16; + else if (len <= 32) p = &chsp->chsp_pool_32; + else if (len <= 64) p = &chsp->chsp_pool_64; + else if (len <= 128) p = &chsp->chsp_pool_128; + else if (len <= 256) p = &chsp->chsp_pool_256; + else if (len <= 512) p = &chsp->chsp_pool_512; + else if (len <= 1024) p = &chsp->chsp_pool_1024; + else { + KASSERT(0); + p = NULL; /* Silence compiler warnings */ + } + + CHFS_POOL_PUT(p, str); +} diff --git a/include/ufs/chfs/chfs_pool.h b/sys/ufs/chfs/chfs_pool.h similarity index 100% rename from include/ufs/chfs/chfs_pool.h rename to sys/ufs/chfs/chfs_pool.h diff --git a/sys/ufs/chfs/chfs_readinode.c b/sys/ufs/chfs/chfs_readinode.c new file mode 100644 index 000000000..3ae626f8d --- /dev/null +++ b/sys/ufs/chfs/chfs_readinode.c @@ -0,0 +1,1136 @@ +/* $NetBSD: chfs_readinode.c,v 1.2 2011/11/24 21:09:37 agc Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * Copyright (C) 2010 David Tengeri + * Copyright (C) 2010 Tamas Toth + * Copyright (C) 2010 Adam Hoka + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * chfs_readinode.c + * + * Created on: 2010.05.31. 
+ * Author: dtengeri + */ + +#include + +#include "chfs.h" + +/* tmp node operations */ +int chfs_check_td_data(struct chfs_mount *, + struct chfs_tmp_dnode *); +int chfs_check_td_node(struct chfs_mount *, + struct chfs_tmp_dnode *); +struct chfs_node_ref *chfs_first_valid_data_ref(struct chfs_node_ref *); +int chfs_add_tmp_dnode_to_tree(struct chfs_mount *, + struct chfs_readinode_info *, + struct chfs_tmp_dnode *); +void chfs_add_tmp_dnode_to_tdi(struct chfs_tmp_dnode_info *, + struct chfs_tmp_dnode *); +void chfs_remove_tmp_dnode_from_tdi(struct chfs_tmp_dnode_info *, + struct chfs_tmp_dnode *); +static void chfs_kill_td(struct chfs_mount *, + struct chfs_tmp_dnode *); +static void chfs_kill_tdi(struct chfs_mount *, + struct chfs_tmp_dnode_info *); +/* frag node operations */ +struct chfs_node_frag *new_fragment(struct chfs_full_dnode *, + uint32_t, + uint32_t); +int no_overlapping_node(struct rb_tree *, struct chfs_node_frag *, + struct chfs_node_frag *, uint32_t); +int chfs_add_frag_to_fragtree(struct chfs_mount *, + struct rb_tree *, + struct chfs_node_frag *); +void chfs_obsolete_node_frag(struct chfs_mount *, + struct chfs_node_frag *); +/* general node operations */ +int chfs_get_data_nodes(struct chfs_mount *, + struct chfs_inode *, + struct chfs_readinode_info *); +int chfs_build_fragtree(struct chfs_mount *, + struct chfs_inode *, + struct chfs_readinode_info *); + + + +/* + * -------------------------- + * tmp node rbtree operations + * -------------------------- + */ +static signed int +tmp_node_compare_nodes(void *ctx, const void *n1, const void *n2) +{ + const struct chfs_tmp_dnode_info *tdi1 = n1; + const struct chfs_tmp_dnode_info *tdi2 = n2; + + return (tdi1->tmpnode->node->ofs - tdi2->tmpnode->node->ofs); +} + +static signed int +tmp_node_compare_key(void *ctx, const void *n, const void *key) +{ + const struct chfs_tmp_dnode_info *tdi = n; + uint64_t ofs = *(const uint64_t *)key; + + return (tdi->tmpnode->node->ofs - ofs); +} + +const rb_tree_ops_t tmp_node_rbtree_ops = { + .rbto_compare_nodes = tmp_node_compare_nodes, + .rbto_compare_key = tmp_node_compare_key, + .rbto_node_offset = offsetof(struct chfs_tmp_dnode_info, rb_node), + .rbto_context = NULL +}; + + +/* + * --------------------------- + * frag node rbtree operations + * --------------------------- + */ +static signed int +frag_compare_nodes(void *ctx, const void *n1, const void *n2) +{ + const struct chfs_node_frag *frag1 = n1; + const struct chfs_node_frag *frag2 = n2; + + return (frag1->ofs - frag2->ofs); +} + +static signed int +frag_compare_key(void *ctx, const void *n, const void *key) +{ + const struct chfs_node_frag *frag = n; + uint64_t ofs = *(const uint64_t *)key; + + return (frag->ofs - ofs); +} + +const rb_tree_ops_t frag_rbtree_ops = { + .rbto_compare_nodes = frag_compare_nodes, + .rbto_compare_key = frag_compare_key, + .rbto_node_offset = offsetof(struct chfs_node_frag, rb_node), + .rbto_context = NULL +}; + + +/* + * ------------------- + * tmp node operations + * ------------------- + */ +/* + * Check the data CRC of the node. + * + * Returns: 0 - if everything OK; + * 1 - if CRC is incorrect; + * 2 - else; + * error code if an error occured. 
+ */ +int +chfs_check_td_data(struct chfs_mount *chmp, + struct chfs_tmp_dnode *td) +{ + int err; + size_t retlen, len, totlen; + uint32_t crc; + uint64_t ofs; + char *buf; + struct chfs_node_ref *nref = td->node->nref; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + KASSERT(!mutex_owned(&chmp->chm_lock_sizes)); + + ofs = CHFS_GET_OFS(nref->nref_offset) + sizeof(struct chfs_flash_data_node); + len = td->node->size; + if (!len) + return 0; + + buf = kmem_alloc(len, KM_SLEEP); + if (!buf) { + dbg("allocating error\n"); + return 2; + } + err = chfs_read_leb(chmp, nref->nref_lnr, buf, ofs, len, &retlen); + if (err) { + dbg("error wile reading: %d\n", err); + err = 2; + goto out; + } + + if (len != retlen) { + dbg("len:%zu, retlen:%zu\n", len, retlen); + err = 2; + goto out; + } + crc = crc32(0, (uint8_t *)buf, len); + + if (crc != td->data_crc) { + dbg("crc failed, calculated: 0x%x, orig: 0x%x\n", crc, td->data_crc); + kmem_free(buf, len); + return 1; + } + + nref->nref_offset = CHFS_GET_OFS(nref->nref_offset) | CHFS_NORMAL_NODE_MASK; + totlen = CHFS_PAD(sizeof(struct chfs_flash_data_node) + len); + + mutex_enter(&chmp->chm_lock_sizes); + chfs_change_size_unchecked(chmp, &chmp->chm_blocks[nref->nref_lnr], -totlen); + chfs_change_size_used(chmp, &chmp->chm_blocks[nref->nref_lnr], totlen); + mutex_exit(&chmp->chm_lock_sizes); + KASSERT(chmp->chm_blocks[nref->nref_lnr].used_size <= chmp->chm_ebh->eb_size); + + err = 0; +out: + kmem_free(buf, len); + return err; +} + +int +chfs_check_td_node(struct chfs_mount *chmp, struct chfs_tmp_dnode *td) +{ + int ret; + + if (CHFS_REF_FLAGS(td->node->nref) != CHFS_UNCHECKED_NODE_MASK) + return 0; + + ret = chfs_check_td_data(chmp, td); + if (ret == 1) { + chfs_mark_node_obsolete(chmp, td->node->nref); + } + return ret; +} + + +struct chfs_node_ref * +chfs_first_valid_data_ref(struct chfs_node_ref *nref) +{ + while (nref) { + if (!CHFS_REF_OBSOLETE(nref)) { +#ifdef DGB_MSG_GC + if (nref->nref_lnr == REF_EMPTY_NODE) { + dbg("FIRST VALID IS EMPTY!\n"); + } +#endif + return nref; + } + + if (nref->nref_next) { + nref = nref->nref_next; + } else + break; + } + return NULL; +} + +void +chfs_add_tmp_dnode_to_tdi(struct chfs_tmp_dnode_info *tdi, + struct chfs_tmp_dnode *td) +{ + if (!tdi->tmpnode) { + tdi->tmpnode = td; + } else { + struct chfs_tmp_dnode *tmp = tdi->tmpnode; + while (tmp->next) { + tmp = tmp->next; + } + tmp->next = td; + } +} + +void +chfs_remove_tmp_dnode_from_tdi(struct chfs_tmp_dnode_info *tdi, + struct chfs_tmp_dnode *td) +{ + if (tdi->tmpnode == td) { + tdi->tmpnode = tdi->tmpnode->next; + } else { + struct chfs_tmp_dnode *tmp = tdi->tmpnode->next; + while (tmp->next && tmp->next != td) { + tmp = tmp->next; + } + if (tmp->next) { + tmp->next = td->next; + } + } +} + +static void +chfs_kill_td(struct chfs_mount *chmp, + struct chfs_tmp_dnode *td) +{ + /* check if we need to mark as obsolete, to avoid double mark */ + if (!CHFS_REF_OBSOLETE(td->node->nref)) { + chfs_mark_node_obsolete(chmp, td->node->nref); + } + + chfs_free_tmp_dnode(td); +} + +static void +chfs_kill_tdi(struct chfs_mount *chmp, + struct chfs_tmp_dnode_info *tdi) +{ + struct chfs_tmp_dnode *next, *tmp = tdi->tmpnode; + + while (tmp) { + next = tmp->next; + chfs_kill_td(chmp, tmp); + tmp = next; + } + + chfs_free_tmp_dnode_info(tdi); +} + +int +chfs_add_tmp_dnode_to_tree(struct chfs_mount *chmp, + struct chfs_readinode_info *rii, + struct chfs_tmp_dnode *newtd) +{ + uint64_t end_ofs = newtd->node->ofs + newtd->node->size; + struct chfs_tmp_dnode_info *this; + struct 
rb_node *node, *prev_node; + struct chfs_tmp_dnode_info *newtdi; + + node = rb_tree_find_node(&rii->tdi_root, &newtd->node->ofs); + if (node) { + this = (struct chfs_tmp_dnode_info *)node; + while (this->tmpnode->overlapped) { + prev_node = rb_tree_iterate(&rii->tdi_root, node, RB_DIR_LEFT); + if (!prev_node) { + this->tmpnode->overlapped = 0; + break; + } + node = prev_node; + this = (struct chfs_tmp_dnode_info *)node; + } + } + while (node) { + this = (struct chfs_tmp_dnode_info *)node; + if (this->tmpnode->node->ofs > end_ofs) + break; + + struct chfs_tmp_dnode *tmp_td = this->tmpnode; + while (tmp_td) { + if (tmp_td->version == newtd->version) { + if (!chfs_check_td_node(chmp, tmp_td)) { + dbg("calling kill td 0\n"); + chfs_kill_td(chmp, newtd); + return 0; + } else { + chfs_remove_tmp_dnode_from_tdi(this, tmp_td); + chfs_kill_td(chmp, tmp_td); + chfs_add_tmp_dnode_to_tdi(this, newtd); + return 0; + } + } + if (tmp_td->version < newtd->version && + tmp_td->node->ofs >= newtd->node->ofs && + tmp_td->node->ofs + tmp_td->node->size <= end_ofs) { + /* New node entirely overlaps 'this' */ + if (chfs_check_td_node(chmp, newtd)) { + dbg("calling kill td 2\n"); + chfs_kill_td(chmp, newtd); + return 0; + } + /* ... and is good. Kill 'this' and any subsequent nodes which are also overlapped */ + while (tmp_td && tmp_td->node->ofs + tmp_td->node->size <= end_ofs) { + struct rb_node *next = rb_tree_iterate(&rii->tdi_root, this, RB_DIR_RIGHT); + struct chfs_tmp_dnode_info *next_tdi = (struct chfs_tmp_dnode_info *)next; + struct chfs_tmp_dnode *next_td = NULL; + if (tmp_td->next) { + next_td = tmp_td->next; + } else if (next_tdi) { + next_td = next_tdi->tmpnode; + } + if (tmp_td->version < newtd->version) { + chfs_remove_tmp_dnode_from_tdi(this, tmp_td); + chfs_kill_td(chmp, tmp_td); + if (!this->tmpnode) { + rb_tree_remove_node(&rii->tdi_root, this); + chfs_kill_tdi(chmp, this); + this = next_tdi; + } + } + tmp_td = next_td; + } + continue; + } + if (tmp_td->version > newtd->version && + tmp_td->node->ofs <= newtd->node->ofs && + tmp_td->node->ofs + tmp_td->node->size >= end_ofs) { + /* New node entirely overlapped by 'this' */ + if (!chfs_check_td_node(chmp, tmp_td)) { + dbg("this version: %llu\n", + (unsigned long long)tmp_td->version); + dbg("this ofs: %llu, size: %u\n", + (unsigned long long)tmp_td->node->ofs, + tmp_td->node->size); + dbg("calling kill td 4\n"); + chfs_kill_td(chmp, newtd); + return 0; + } + /* ... but 'this' was bad. Replace it... */ + chfs_remove_tmp_dnode_from_tdi(this, tmp_td); + chfs_kill_td(chmp, tmp_td); + if (!this->tmpnode) { + rb_tree_remove_node(&rii->tdi_root, this); + chfs_kill_tdi(chmp, this); + } + dbg("calling kill td 5\n"); + chfs_kill_td(chmp, newtd); + break; + } + tmp_td = tmp_td->next; + } + node = rb_tree_iterate(&rii->tdi_root, node, RB_DIR_RIGHT); + } + + newtdi = chfs_alloc_tmp_dnode_info(); + chfs_add_tmp_dnode_to_tdi(newtdi, newtd); + /* We neither completely obsoleted nor were completely + obsoleted by an earlier node. 
Insert into the tree */ + struct chfs_tmp_dnode_info *tmp_tdi = rb_tree_insert_node(&rii->tdi_root, newtdi); + if (tmp_tdi != newtdi) { + chfs_add_tmp_dnode_to_tdi(tmp_tdi, newtd); + newtdi->tmpnode = NULL; + chfs_kill_tdi(chmp, newtdi); + } + + /* If there's anything behind that overlaps us, note it */ + node = rb_tree_iterate(&rii->tdi_root, node, RB_DIR_LEFT); + if (node) { + while (1) { + this = (struct chfs_tmp_dnode_info *)node; + if (this->tmpnode->node->ofs + this->tmpnode->node->size > newtd->node->ofs) { + newtd->overlapped = 1; + } + if (!this->tmpnode->overlapped) + break; + + prev_node = rb_tree_iterate(&rii->tdi_root, node, RB_DIR_LEFT); + if (!prev_node) { + this->tmpnode->overlapped = 0; + break; + } + node = prev_node; + } + } + + /* If the new node overlaps anything ahead, note it */ + node = rb_tree_iterate(&rii->tdi_root, node, RB_DIR_RIGHT); + this = (struct chfs_tmp_dnode_info *)node; + while (this && this->tmpnode->node->ofs < end_ofs) { + this->tmpnode->overlapped = 1; + node = rb_tree_iterate(&rii->tdi_root, node, RB_DIR_RIGHT); + this = (struct chfs_tmp_dnode_info *)node; + } + return 0; +} + + +/* + * -------------------- + * frag node operations + * -------------------- + */ +struct chfs_node_frag * +new_fragment(struct chfs_full_dnode *fdn, uint32_t ofs, uint32_t size) +{ + struct chfs_node_frag *newfrag; + newfrag = chfs_alloc_node_frag(); + if (newfrag) { + newfrag->ofs = ofs; + newfrag->size = size; + newfrag->node = fdn; + } else { + chfs_err("cannot allocate a chfs_node_frag object\n"); + } + return newfrag; +} + +int +no_overlapping_node(struct rb_tree *fragtree, + struct chfs_node_frag *newfrag, + struct chfs_node_frag *this, uint32_t lastend) +{ + if (lastend < newfrag->node->ofs) { + struct chfs_node_frag *holefrag; + + holefrag = new_fragment(NULL, lastend, newfrag->node->ofs - lastend); + if (!holefrag) { + chfs_free_node_frag(newfrag); + return ENOMEM; + } + + rb_tree_insert_node(fragtree, holefrag); + this = holefrag; + } + + rb_tree_insert_node(fragtree, newfrag); + + return 0; +} + +int +chfs_add_frag_to_fragtree(struct chfs_mount *chmp, + struct rb_tree *fragtree, + struct chfs_node_frag *newfrag) +{ + struct chfs_node_frag *this; + uint32_t lastend; + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + this = (struct chfs_node_frag *)rb_tree_find_node_leq(fragtree, &newfrag->ofs); + + if (this) { + lastend = this->ofs + this->size; + } else { + lastend = 0; + } + + if (lastend <= newfrag->ofs) { + //dbg("no overlapping node\n"); + if (lastend && (lastend - 1) >> PAGE_SHIFT == newfrag->ofs >> PAGE_SHIFT) { + if (this->node) + CHFS_MARK_REF_NORMAL(this->node->nref); + CHFS_MARK_REF_NORMAL(newfrag->node->nref); + } + return no_overlapping_node(fragtree, newfrag, this, lastend); + } + + if (newfrag->ofs > this->ofs) { + + CHFS_MARK_REF_NORMAL(newfrag->node->nref); + if (this->node) + CHFS_MARK_REF_NORMAL(this->node->nref); + + if (this->ofs + this->size > newfrag->ofs + newfrag->size) { + /* newfrag is inside of this */ + //dbg("newfrag is inside of this\n"); + struct chfs_node_frag *newfrag2; + + newfrag2 = new_fragment(this->node, newfrag->ofs + newfrag->size, + this->ofs + this->size - newfrag->ofs - newfrag->size); + if (!newfrag2) + return ENOMEM; + if (this->node) + this->node->frags++; + + this->size = newfrag->ofs - this->ofs; + + rb_tree_insert_node(fragtree, newfrag); + rb_tree_insert_node(fragtree, newfrag2); + + return 0; + } + /* newfrag is bottom of this */ + //dbg("newfrag is bottom of this\n"); + this->size = newfrag->ofs - 
this->ofs; + rb_tree_insert_node(fragtree, newfrag); + } else { + /* newfrag start at same point */ + //dbg("newfrag start at same point\n"); + //TODO replace instead of remove and insert + rb_tree_remove_node(fragtree, this); + rb_tree_insert_node(fragtree, newfrag); + + if (newfrag->ofs + newfrag->size >= this->ofs+this->size) { + chfs_obsolete_node_frag(chmp, this); + } else { + this->ofs += newfrag->size; + this->size -= newfrag->size; + + rb_tree_insert_node(fragtree, this); + return 0; + } + } + /* OK, now we have newfrag added in the correct place in the tree, but + frag_next(newfrag) may be a fragment which is overlapped by it + */ + while ((this = frag_next(fragtree, newfrag)) && newfrag->ofs + newfrag->size >= this->ofs + this->size) { + rb_tree_remove_node(fragtree, this); + chfs_obsolete_node_frag(chmp, this); + } + + if (!this || newfrag->ofs + newfrag->size == this->ofs) + return 0; + + this->size = (this->ofs + this->size) - (newfrag->ofs + newfrag->size); + this->ofs = newfrag->ofs + newfrag->size; + + if (this->node) + CHFS_MARK_REF_NORMAL(this->node->nref); + CHFS_MARK_REF_NORMAL(newfrag->node->nref); + + return 0; +} + +void +chfs_kill_fragtree(struct rb_tree *fragtree) +{ + struct chfs_node_frag *this, *next; + //dbg("start\n"); + + this = (struct chfs_node_frag *)RB_TREE_MIN(fragtree); + while (this) { + //for (this = (struct chfs_node_frag *)RB_TREE_MIN(&fragtree); this != NULL; this = (struct chfs_node_frag *)rb_tree_iterate(&fragtree, &this->rb_node, RB_DIR_RIGHT)) { + next = frag_next(fragtree, this); + rb_tree_remove_node(fragtree, this); + chfs_free_node_frag(this); + //dbg("one frag killed\n"); + this = next; + } + //dbg("end\n"); +} + +uint32_t +chfs_truncate_fragtree(struct chfs_mount *chmp, + struct rb_tree *fragtree, uint32_t size) +{ + struct chfs_node_frag *frag; + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + dbg("truncate to size: %u\n", size); + + frag = (struct chfs_node_frag *)rb_tree_find_node_leq(fragtree, &size); + + /* Find the last frag before size and set its new size. */ + if (frag && frag->ofs != size) { + if (frag->ofs + frag->size > size) { + frag->size = size - frag->ofs; + } + frag = frag_next(fragtree, frag); + } + + /* Delete frags after new size. */ + while (frag && frag->ofs >= size) { + struct chfs_node_frag *next = frag_next(fragtree, frag); + + rb_tree_remove_node(fragtree, frag); + chfs_obsolete_node_frag(chmp, frag); + frag = next; + } + + if (size == 0) { + return 0; + } + + frag = frag_last(fragtree); + + if (!frag) { + return 0; + } + + if (frag->ofs + frag->size < size) { + return frag->ofs + frag->size; + } + + /* FIXME Should we check the postion of the last node? (PAGE_CACHE size, etc.) 
*/ + if (frag->node && (frag->ofs & (PAGE_SIZE - 1)) == 0) { + frag->node->nref->nref_offset = CHFS_GET_OFS(frag->node->nref->nref_offset) | CHFS_PRISTINE_NODE_MASK; + } + + return size; +} + +void +chfs_obsolete_node_frag(struct chfs_mount *chmp, + struct chfs_node_frag *this) +{ + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + if (this->node) { + this->node->frags--; + if (!this->node->frags) { + struct chfs_vnode_cache *vc = chfs_nref_to_vc(this->node->nref); + chfs_mark_node_obsolete(chmp, this->node->nref); + + if (vc->dnode == this->node->nref) { + vc->dnode = this->node->nref->nref_next; + } else { + struct chfs_node_ref *tmp = vc->dnode; + while (tmp->nref_next != (struct chfs_node_ref*) vc + && tmp->nref_next != this->node->nref) { + tmp = tmp->nref_next; + } + if (tmp->nref_next == this->node->nref) { + tmp->nref_next = this->node->nref->nref_next; + } + // FIXME should we free here the this->node->nref? + } + + chfs_free_full_dnode(this->node); + } else { + CHFS_MARK_REF_NORMAL(this->node->nref); + } + } + chfs_free_node_frag(this); +} + +int +chfs_add_full_dnode_to_inode(struct chfs_mount *chmp, + struct chfs_inode *ip, + struct chfs_full_dnode *fd) +{ + int ret; + struct chfs_node_frag *newfrag; + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + if (unlikely(!fd->size)) + return 0; + + newfrag = new_fragment(fd, fd->ofs, fd->size); + if (unlikely(!newfrag)) + return ENOMEM; + + newfrag->node->frags = 1; + + ret = chfs_add_frag_to_fragtree(chmp, &ip->fragtree, newfrag); + if (ret) + return ret; + + if (newfrag->ofs & (PAGE_SIZE - 1)) { + struct chfs_node_frag *prev = frag_prev(&ip->fragtree, newfrag); + + CHFS_MARK_REF_NORMAL(fd->nref); + if (prev->node) + CHFS_MARK_REF_NORMAL(prev->node->nref); + } + + if ((newfrag->ofs+newfrag->size) & (PAGE_SIZE - 1)) { + struct chfs_node_frag *next = frag_next(&ip->fragtree, newfrag); + + if (next) { + CHFS_MARK_REF_NORMAL(fd->nref); + if (next->node) + CHFS_MARK_REF_NORMAL(next->node->nref); + } + } + + return 0; +} + + +/* + * ----------------------- + * general node operations + * ----------------------- + */ +/* get tmp nodes of an inode */ +int +chfs_get_data_nodes(struct chfs_mount *chmp, + struct chfs_inode *ip, + struct chfs_readinode_info *rii) +{ + uint32_t crc; + int err; + size_t len, retlen; + struct chfs_node_ref *nref; + struct chfs_flash_data_node *dnode; + struct chfs_tmp_dnode *td; + char* buf; + + len = sizeof(struct chfs_flash_data_node); + buf = kmem_alloc(len, KM_SLEEP); + + dnode = kmem_alloc(len, KM_SLEEP); + if (!dnode) + return ENOMEM; + + nref = chfs_first_valid_data_ref(ip->chvc->dnode); + + rii->highest_version = ip->chvc->highest_version; + + while(nref && (struct chfs_vnode_cache *)nref != ip->chvc) { + err = chfs_read_leb(chmp, nref->nref_lnr, buf, CHFS_GET_OFS(nref->nref_offset), len, &retlen); + if (err || len != retlen) + goto out; + dnode = (struct chfs_flash_data_node*)buf; + + //check header crc + crc = crc32(0, (uint8_t *)dnode, CHFS_NODE_HDR_SIZE - 4); + if (crc != le32toh(dnode->hdr_crc)) { + chfs_err("CRC check failed. calc: 0x%x orig: 0x%x\n", crc, le32toh(dnode->hdr_crc)); + goto cont; + } + //check header magic bitmask + if (le16toh(dnode->magic) != CHFS_FS_MAGIC_BITMASK) { + chfs_err("Wrong magic bitmask.\n"); + goto cont; + } + //check node crc + crc = crc32(0, (uint8_t *)dnode, sizeof(*dnode) - 4); + if (crc != le32toh(dnode->node_crc)) { + chfs_err("Node CRC check failed. 
calc: 0x%x orig: 0x%x\n", crc, le32toh(dnode->node_crc)); + goto cont; + } + td = chfs_alloc_tmp_dnode(); + if (!td) { + chfs_err("Can't allocate tmp dnode info.\n"); + err = ENOMEM; + goto out; + } + /* We don't check data crc here, just add nodes to tmp frag tree, because + * we don't want to check nodes which have been overlapped by a new node + * with a higher version number. + */ + td->node = chfs_alloc_full_dnode(); + if (!td->node) { + chfs_err("Can't allocate full dnode info.\n"); + err = ENOMEM; + goto out_tmp_dnode; + } + td->version = le64toh(dnode->version); + td->node->ofs = le64toh(dnode->offset); + td->data_crc = le32toh(dnode->data_crc); + td->node->nref = nref; + td->node->size = le32toh(dnode->data_length); + td->overlapped = 0; + + if (td->version > rii->highest_version) { + rii->highest_version = td->version; + } + + err = chfs_add_tmp_dnode_to_tree(chmp, rii, td); + if (err) + goto out_full_dnode; + +cont: + nref = chfs_first_valid_data_ref(nref->nref_next); + } + + ip->chvc->highest_version = rii->highest_version; + return 0; + +/* Exit points */ +out_full_dnode: + chfs_free_full_dnode(td->node); +out_tmp_dnode: + chfs_free_tmp_dnode(td); +out: + kmem_free(buf, len); + kmem_free(dnode, len); + return err; +} + + +/* Build final normal fragtree from tdi tree. */ +int +chfs_build_fragtree(struct chfs_mount *chmp, struct chfs_inode *ip, + struct chfs_readinode_info *rii) +{ + struct chfs_tmp_dnode_info *pen, *last, *this; + struct rb_tree ver_tree; /* version tree */ + uint64_t high_ver = 0; + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + rb_tree_init(&ver_tree, &tmp_node_rbtree_ops); + + if (rii->mdata_tn) { + high_ver = rii->mdata_tn->tmpnode->version; + rii->latest_ref = rii->mdata_tn->tmpnode->node->nref; + } + + pen = (struct chfs_tmp_dnode_info *)RB_TREE_MAX(&rii->tdi_root); + + while((last = pen)) { + pen = (struct chfs_tmp_dnode_info *)rb_tree_iterate(&rii->tdi_root, last, RB_DIR_LEFT); + + rb_tree_remove_node(&rii->tdi_root, last); + rb_tree_insert_node(&ver_tree, last); + + if (last->tmpnode->overlapped) { + if (pen) + continue; + + last->tmpnode->overlapped = 0; + } + + this = (struct chfs_tmp_dnode_info *)RB_TREE_MAX(&ver_tree); + + while (this) { + struct chfs_tmp_dnode_info *vers_next; + int ret; + + vers_next = (struct chfs_tmp_dnode_info *)rb_tree_iterate(&ver_tree, this, RB_DIR_LEFT); + rb_tree_remove_node(&ver_tree, this); + + struct chfs_tmp_dnode *tmp_td = this->tmpnode; + while (tmp_td) { + struct chfs_tmp_dnode *next_td = tmp_td->next; + + if (chfs_check_td_node(chmp, tmp_td)) { + if (next_td) { + chfs_remove_tmp_dnode_from_tdi(this, tmp_td); + } else { + break; + } + } else { + if (tmp_td->version > high_ver) { + high_ver = tmp_td->version; + dbg("highver: %llu\n", (unsigned long long)high_ver); + rii->latest_ref = tmp_td->node->nref; + } + + ret = chfs_add_full_dnode_to_inode(chmp, ip, tmp_td->node); + if (ret) { + while (1) { + vers_next = (struct chfs_tmp_dnode_info *)rb_tree_iterate(&ver_tree, this, RB_DIR_LEFT); + while (tmp_td) { + next_td = tmp_td->next; + if (chfs_check_td_node(chmp, tmp_td) > 1) { + chfs_mark_node_obsolete(chmp, + tmp_td->node->nref); + } + chfs_free_full_dnode(tmp_td->node); + chfs_remove_tmp_dnode_from_tdi(this, tmp_td); + chfs_free_tmp_dnode(tmp_td); + tmp_td = next_td; + } + chfs_free_tmp_dnode_info(this); + this = vers_next; + if (!this) + break; + rb_tree_remove_node(&ver_tree, vers_next); + } + return ret; + } + + chfs_remove_tmp_dnode_from_tdi(this, tmp_td); + chfs_free_tmp_dnode(tmp_td); + } + tmp_td = 
next_td; + } + chfs_kill_tdi(chmp, this); + this = vers_next; + } + } + + return 0; +} + +int chfs_read_inode(struct chfs_mount *chmp, struct chfs_inode *ip) +{ + struct chfs_vnode_cache *vc = ip->chvc; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + +retry: + /* XXX locking */ + //mutex_enter(&chmp->chm_lock_vnocache); + switch (vc->state) { + case VNO_STATE_UNCHECKED: + case VNO_STATE_CHECKEDABSENT: +// chfs_vnode_cache_set_state(chmp, vc, VNO_STATE_READING); + vc->state = VNO_STATE_READING; + break; + case VNO_STATE_CHECKING: + case VNO_STATE_GC: + //sleep_on_spinunlock(&chmp->chm_lock_vnocache); + //KASSERT(!mutex_owned(&chmp->chm_lock_vnocache)); + goto retry; + break; + case VNO_STATE_PRESENT: + case VNO_STATE_READING: + chfs_err("Reading inode #%llu in state %d!\n", + (unsigned long long)vc->vno, vc->state); + chfs_err("wants to read a nonexistent ino %llu\n", + (unsigned long long)vc->vno); + return ENOENT; + default: + panic("BUG() Bad vno cache state."); + } + //mutex_exit(&chmp->chm_lock_vnocache); + + return chfs_read_inode_internal(chmp, ip); +} + +/* + * Read inode frags. + * Firstly get tmp nodes, + * secondly build fragtree from those. + */ +int +chfs_read_inode_internal(struct chfs_mount *chmp, struct chfs_inode *ip) +{ + int err; + size_t len, retlen; + char* buf; + struct chfs_readinode_info rii; + struct chfs_flash_vnode *fvnode; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + len = sizeof(*fvnode); + + memset(&rii, 0, sizeof(rii)); + + rb_tree_init(&rii.tdi_root, &tmp_node_rbtree_ops); + + /* build up a temp node frag tree */ + err = chfs_get_data_nodes(chmp, ip, &rii); + if (err) { + if (ip->chvc->state == VNO_STATE_READING) + ip->chvc->state = VNO_STATE_CHECKEDABSENT; + /* FIXME Should we kill fragtree or something here? */ + return err; + } + + rb_tree_init(&ip->fragtree, &frag_rbtree_ops); + /* + * build fragtree from temp nodes + */ + err = chfs_build_fragtree(chmp, ip, &rii); + if (err) { + if (ip->chvc->state == VNO_STATE_READING) + ip->chvc->state = VNO_STATE_CHECKEDABSENT; + /* FIXME Should we kill fragtree or something here? */ + return err; + } + + if (!rii.latest_ref) { + return 0; + } + + buf = kmem_alloc(len, KM_SLEEP); + if (!buf) + return ENOMEM; + + /* + * set inode size from chvc->v + */ + err = chfs_read_leb(chmp, ip->chvc->v->nref_lnr, buf, CHFS_GET_OFS(ip->chvc->v->nref_offset), len, &retlen); + if (err || retlen != len) { + kmem_free(buf, len); + return err?err:EIO; + } + + fvnode = (struct chfs_flash_vnode*)buf; + + dbg("set size from v: %u\n", fvnode->dn_size); + chfs_set_vnode_size(ITOV(ip), fvnode->dn_size); + uint32_t retsize = chfs_truncate_fragtree(chmp, &ip->fragtree, fvnode->dn_size); + if (retsize != fvnode->dn_size) { + dbg("Truncating failed. 
It is %u instead of %u\n", retsize, fvnode->dn_size); + } + + kmem_free(buf, len); + + if (ip->chvc->state == VNO_STATE_READING) { + ip->chvc->state = VNO_STATE_PRESENT; + } + + return 0; +} + +int +chfs_read_data(struct chfs_mount* chmp, struct vnode *vp, + struct buf *bp) +{ + off_t ofs; + struct chfs_node_frag *frag; + char * buf; + int err = 0; + size_t size, retlen; + uint32_t crc; + struct chfs_inode *ip = VTOI(vp); + struct chfs_flash_data_node *dnode; + struct chfs_node_ref *nref; + + memset(bp->b_data, 0, bp->b_bcount); + + ofs = bp->b_blkno * PAGE_SIZE; + frag = (struct chfs_node_frag *)rb_tree_find_node_leq(&ip->fragtree, &ofs); + + if (!frag || frag->ofs > ofs || frag->ofs + frag->size <= ofs) { + dbg("not found in frag tree\n"); + return 0; + } + + if (!frag->node) { + dbg("no node in frag\n"); + return 0; + } + + nref = frag->node->nref; + + size = sizeof(*dnode) + frag->size; + + buf = kmem_alloc(size, KM_SLEEP); + + dbg("reading from lnr: %u, offset: %u, size: %zu\n", nref->nref_lnr, CHFS_GET_OFS(nref->nref_offset), size); + err = chfs_read_leb(chmp, nref->nref_lnr, buf, CHFS_GET_OFS(nref->nref_offset), size, &retlen); + if (err) { + chfs_err("error after reading: %d\n", err); + goto out; + } + if (retlen != size) { + chfs_err("retlen: %zu != size: %zu\n", retlen, size); + err = EIO; + goto out; + } + + dnode = (struct chfs_flash_data_node *)buf; + crc = crc32(0, (uint8_t *)dnode, CHFS_NODE_HDR_SIZE - 4); + if (crc != le32toh(dnode->hdr_crc)) { + chfs_err("CRC check failed. calc: 0x%x orig: 0x%x\n", crc, le32toh(dnode->hdr_crc)); + err = EIO; + goto out; + } + //check header magic bitmask + if (le16toh(dnode->magic) != CHFS_FS_MAGIC_BITMASK) { + chfs_err("Wrong magic bitmask.\n"); + err = EIO; + goto out; + } + //check node crc + crc = crc32(0, (uint8_t *)dnode, sizeof(*dnode) - 4); + if (crc != le32toh(dnode->node_crc)) { + chfs_err("Node CRC check failed. calc: 0x%x orig: 0x%x\n", crc, le32toh(dnode->node_crc)); + err = EIO; + goto out; + } + crc = crc32(0, (uint8_t *)dnode->data, dnode->data_length); + if (crc != le32toh(dnode->data_crc)) { + chfs_err("Data CRC check failed. calc: 0x%x orig: 0x%x\n", crc, le32toh(dnode->data_crc)); + err = EIO; + goto out; + } + + memcpy(bp->b_data, dnode->data, dnode->data_length); + bp->b_resid = 0; + +out: + kmem_free(buf, size); + return err; +} diff --git a/sys/ufs/chfs/chfs_scan.c b/sys/ufs/chfs/chfs_scan.c new file mode 100644 index 000000000..a35ce7215 --- /dev/null +++ b/sys/ufs/chfs/chfs_scan.c @@ -0,0 +1,740 @@ +/* $NetBSD: chfs_scan.c,v 1.2 2011/11/24 21:09:37 agc Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * Copyright (c) 2010 David Tengeri + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * chfs_scan.c + * + * Created on: 2009.11.05. + * Author: dtengeri + */ + +#include "chfs.h" + +/** + * chfs_scan_make_vnode_cache - makes a new vnode cache during scan + * @chmp: CHFS main descriptor structure + * @vno: vnode identifier + * This function returns a vnode cache belonging to @vno. + */ +struct chfs_vnode_cache * +chfs_scan_make_vnode_cache(struct chfs_mount *chmp, ino_t vno) +{ + struct chfs_vnode_cache *vc; + + KASSERT(mutex_owned(&chmp->chm_lock_vnocache)); + + vc = chfs_vnode_cache_get(chmp, vno); + if (vc) { + return vc; + } + + if (vno > chmp->chm_max_vno) { + chmp->chm_max_vno = vno; + } + + vc = chfs_vnode_cache_alloc(vno); + + //mutex_enter(&chmp->chm_lock_vnocache); + + chfs_vnode_cache_add(chmp, vc); + + //mutex_exit(&chmp->chm_lock_vnocache); + + if (vno == CHFS_ROOTINO) { + vc->nlink = 2; + vc->pvno = CHFS_ROOTINO; + chfs_vnode_cache_set_state(chmp, + vc, VNO_STATE_CHECKEDABSENT); + } + + return vc; +} + +/** + * chfs_scan_check_node_hdr - checks node magic and crc + * @nhdr: node header to check + * Returns 0 if everything is OK, error code otherwise. + */ +int +chfs_scan_check_node_hdr(struct chfs_flash_node_hdr *nhdr) +{ + uint16_t magic; + uint32_t crc, hdr_crc; + + magic = le16toh(nhdr->magic); + + if (magic != CHFS_FS_MAGIC_BITMASK) { + dbg("bad magic\n"); + return CHFS_NODE_BADMAGIC; + } + + hdr_crc = le32toh(nhdr->hdr_crc); + crc = crc32(0, (uint8_t *)nhdr, CHFS_NODE_HDR_SIZE - 4); + + if (crc != hdr_crc) { + dbg("bad crc\n"); + return CHFS_NODE_BADCRC; + } + + return CHFS_NODE_OK; +} + +/** + * chfs_scan_check_vnode - check vnode crc and add to vnode cache + * @chmp: CHFS main descriptor structure + * @cheb: eraseblock informations + * @buf: vnode to check + * @ofs: offset in eraseblock where vnode starts + */ +int +chfs_scan_check_vnode(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb, void *buf, off_t ofs) +{ + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + struct chfs_vnode_cache *vc; + struct chfs_flash_vnode *vnode = buf; + struct chfs_node_ref *nref; + int err; + uint32_t crc; + ino_t vno; + + crc = crc32(0, (uint8_t *)vnode, + sizeof(struct chfs_flash_vnode) - 4); + + if (crc != le32toh(vnode->node_crc)) { + err = chfs_update_eb_dirty(chmp, + cheb, le32toh(vnode->length)); + if (err) { + return err; + } + + return CHFS_NODE_BADCRC; + } + + vno = le64toh(vnode->vno); + + mutex_enter(&chmp->chm_lock_vnocache); + vc = chfs_vnode_cache_get(chmp, vno); + if (!vc) { + vc = chfs_scan_make_vnode_cache(chmp, vno); + if (!vc) { + mutex_exit(&chmp->chm_lock_vnocache); + return ENOMEM; + } + } + mutex_exit(&chmp->chm_lock_vnocache); + + nref = chfs_alloc_node_ref(cheb); + + nref->nref_offset = ofs; + + KASSERT(nref->nref_lnr == cheb->lnr); + + /* Check version of vnode. 
*/ + if ((struct chfs_vnode_cache *)vc->v != vc) { + if (le64toh(vnode->version) > *vc->vno_version) { + //err = chfs_update_eb_dirty(chmp, &chmp->chm_blocks[vc->v->lnr], + // sizeof(struct chfs_flash_vnode)); + *vc->vno_version = le64toh(vnode->version); + chfs_add_vnode_ref_to_vc(chmp, vc, nref); + } else { + err = chfs_update_eb_dirty(chmp, cheb, + sizeof(struct chfs_flash_vnode)); + return CHFS_NODE_OK; + } + } else { + vc->vno_version = kmem_alloc(sizeof(uint64_t), KM_SLEEP); + if (!vc->vno_version) + return ENOMEM; + *vc->vno_version = le64toh(vnode->version); + chfs_add_vnode_ref_to_vc(chmp, vc, nref); + } + + mutex_enter(&chmp->chm_lock_sizes); + //dbg("B:lnr: %d |free_size: %d node's size: %d\n", cheb->lnr, cheb->free_size, le32toh(vnode->length)); + chfs_change_size_free(chmp, cheb, -le32toh(vnode->length)); + chfs_change_size_used(chmp, cheb, le32toh(vnode->length)); + mutex_exit(&chmp->chm_lock_sizes); + + KASSERT(cheb->used_size <= chmp->chm_ebh->eb_size); + + KASSERT(cheb->used_size + cheb->free_size + cheb->dirty_size + cheb->unchecked_size + cheb->wasted_size == chmp->chm_ebh->eb_size); + + //dbg(" A: free_size: %d\n", cheb->free_size); + + /*dbg("vnode dump:\n"); + dbg(" ->magic: 0x%x\n", le16toh(vnode->magic)); + dbg(" ->type: %d\n", le16toh(vnode->type)); + dbg(" ->length: %d\n", le32toh(vnode->length)); + dbg(" ->hdr_crc: 0x%x\n", le32toh(vnode->hdr_crc)); + dbg(" ->vno: %d\n", le64toh(vnode->vno)); + dbg(" ->version: %ld\n", le64toh(vnode->version)); + dbg(" ->uid: %d\n", le16toh(vnode->uid)); + dbg(" ->gid: %d\n", le16toh(vnode->gid)); + dbg(" ->mode: %d\n", le32toh(vnode->mode)); + dbg(" ->dn_size: %d\n", le32toh(vnode->dn_size)); + dbg(" ->atime: %d\n", le32toh(vnode->atime)); + dbg(" ->mtime: %d\n", le32toh(vnode->mtime)); + dbg(" ->ctime: %d\n", le32toh(vnode->ctime)); + dbg(" ->dsize: %d\n", le32toh(vnode->dsize)); + dbg(" ->node_crc: 0x%x\n", le32toh(vnode->node_crc));*/ + + return CHFS_NODE_OK; +} + +int +chfs_scan_mark_dirent_obsolete(struct chfs_mount *chmp, + struct chfs_vnode_cache *vc, struct chfs_dirent *fd) +{ + //int size; + struct chfs_eraseblock *cheb; + struct chfs_node_ref *prev, *nref; + + nref = fd->nref; + cheb = &chmp->chm_blocks[fd->nref->nref_lnr]; + + /* Remove dirent's node ref from vnode cache */ + prev = vc->dirents; + if (prev && prev == nref) { + vc->dirents = prev->nref_next; + } else if (prev && prev != (void *)vc) { + while (prev->nref_next && prev->nref_next != + (void *)vc && prev->nref_next != nref) { + prev = prev->nref_next; + } + + if (prev->nref_next == nref) { + prev->nref_next = nref->nref_next; + } + } + /*dbg("XXX - start\n"); + //nref = vc->dirents; + struct chfs_dirent *tmp; + tmp = vc->scan_dirents; + while (tmp) { + dbg(" ->tmp->name: %s\n", tmp->name); + dbg(" ->tmp->version: %ld\n", tmp->version); + dbg(" ->tmp->vno: %d\n", tmp->vno); + tmp = tmp->next; + } + dbg("XXX - end\n");*/ + //size = CHFS_PAD(sizeof(struct chfs_flash_dirent_node) + fd->nsize); + + KASSERT(cheb->used_size + cheb->free_size + cheb->dirty_size + + cheb->unchecked_size + cheb->wasted_size == chmp->chm_ebh->eb_size); + + return 0; +} + +void +chfs_add_fd_to_list(struct chfs_mount *chmp, + struct chfs_dirent *new, struct chfs_vnode_cache *pvc) +{ + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + int size; + struct chfs_eraseblock *cheb, *oldcheb; +// struct chfs_dirent **prev; + struct chfs_dirent *fd, *tmpfd; + + dbg("adding fd to list: %s\n", new->name); + + if ((new->version > pvc->highest_version)) + pvc->highest_version = new->version; + + 
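/*
 * Note (annotation, not part of the original patch): the code below keeps the
 * parent's scan_dirents list ordered by name hash (nhash). A new dirent is
 * inserted before the first entry with a larger hash; when an entry with the
 * same hash and name already exists, only the higher version is kept, the
 * older one is marked obsolete and its space is moved from used to dirty.
 * As a hypothetical example, with entries (nhash 0x10, version 1) and
 * (nhash 0x20, version 3) already queued, adding (0x20, version 5) for the
 * same name replaces version 3, while adding (0x30, version 1) simply goes
 * to the tail of the list.
 */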
size = CHFS_PAD(sizeof(struct chfs_flash_dirent_node) + + new->nsize); + cheb = &chmp->chm_blocks[new->nref->nref_lnr]; + + mutex_enter(&chmp->chm_lock_sizes); + TAILQ_FOREACH_SAFE(fd, &pvc->scan_dirents, fds, tmpfd) { + if (fd->nhash > new->nhash) { + /* insert new before fd */ + TAILQ_INSERT_BEFORE(fd, new, fds); + goto out; + } else if (fd->nhash == new->nhash && + !strcmp(fd->name, new->name)) { + if (new->version > fd->version) { +// new->next = fd->next; + /* replace fd with new */ + TAILQ_INSERT_BEFORE(fd, new, fds); + chfs_change_size_free(chmp, cheb, -size); + chfs_change_size_used(chmp, cheb, size); + + TAILQ_REMOVE(&pvc->scan_dirents, fd, fds); + if (fd->nref) { + size = CHFS_PAD(sizeof(struct chfs_flash_dirent_node) + fd->nsize); + chfs_scan_mark_dirent_obsolete(chmp, pvc, fd); + oldcheb = &chmp->chm_blocks[fd->nref->nref_lnr]; + chfs_change_size_used(chmp, oldcheb, -size); + chfs_change_size_dirty(chmp, oldcheb, size); + } + chfs_free_dirent(fd); +// *prev = new;//XXX + } else { + chfs_scan_mark_dirent_obsolete(chmp, pvc, new); + chfs_change_size_free(chmp, cheb, -size); + chfs_change_size_dirty(chmp, cheb, size); + chfs_free_dirent(new); + } + /*dbg("START\n"); + fd = pvc->scan_dirents; + while (fd) { + dbg("dirent dump:\n"); + dbg(" ->vno: %d\n", fd->vno); + dbg(" ->version: %ld\n", fd->version); + dbg(" ->nhash: 0x%x\n", fd->nhash); + dbg(" ->nsize: %d\n", fd->nsize); + dbg(" ->name: %s\n", fd->name); + dbg(" ->type: %d\n", fd->type); + fd = fd->next; + } + dbg("END\n");*/ + mutex_exit(&chmp->chm_lock_sizes); + return; + } + } + /* if we couldnt fit it elsewhere, lets add to the end */ + TAILQ_INSERT_TAIL(&pvc->scan_dirents, new, fds); + +out: + //dbg("B:lnr: %d |free_size: %d size: %d\n", cheb->lnr, cheb->free_size, size); + chfs_change_size_free(chmp, cheb, -size); + chfs_change_size_used(chmp, cheb, size); + mutex_exit(&chmp->chm_lock_sizes); + + KASSERT(cheb->used_size <= chmp->chm_ebh->eb_size); + //dbg(" A: free_size: %d\n", cheb->free_size); + + KASSERT(cheb->used_size + cheb->free_size + cheb->dirty_size + cheb->unchecked_size + cheb->wasted_size == chmp->chm_ebh->eb_size); + + +// fd = pvc->scan_dirents; + /*dbg("START\n"); + while (fd) { + dbg("dirent dump:\n"); + dbg(" ->vno: %d\n", fd->vno); + dbg(" ->version: %ld\n", fd->version); + dbg(" ->nhash: 0x%x\n", fd->nhash); + dbg(" ->nsize: %d\n", fd->nsize); + dbg(" ->name: %s\n", fd->name); + dbg(" ->type: %d\n", fd->type); + fd = fd->next; + } + dbg("END\n");*/ +} +/** + * chfs_scan_check_dirent_node - check vnode crc and add to vnode cache + * @chmp: CHFS main descriptor structure + * @cheb: eraseblock informations + * @buf: directory entry to check + * @ofs: offset in eraseblock where dirent starts + */ +int +chfs_scan_check_dirent_node(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb, void *buf, off_t ofs) +{ + int err, namelen; + uint32_t crc; + struct chfs_dirent *fd; + struct chfs_vnode_cache *vc; + struct chfs_flash_dirent_node *dirent = buf; + + //struct chfs_node_ref *tmp; + + crc = crc32(0, (uint8_t *)dirent, sizeof(*dirent) - 4); + if (crc != le32toh(dirent->node_crc)) { + err = chfs_update_eb_dirty(chmp, cheb, le32toh(dirent->length)); + if (err) + return err; + return CHFS_NODE_BADCRC; + } + namelen = dirent->nsize; + + fd = chfs_alloc_dirent(namelen + 1); + if (!fd) + return ENOMEM; + + fd->nref = chfs_alloc_node_ref(cheb); + if (!fd->nref) + return ENOMEM; + + KASSERT(fd->nref->nref_lnr == cheb->lnr); + + memcpy(&fd->name, dirent->name, namelen); + fd->nsize = namelen; + fd->name[namelen] = 
0; + crc = crc32(0, fd->name, dirent->nsize); + if (crc != le32toh(dirent->name_crc)) { + chfs_err("Directory entry's name has bad crc: read: 0x%x, " + "calculated: 0x%x\n", le32toh(dirent->name_crc), crc); + chfs_free_dirent(fd); + err = chfs_update_eb_dirty(chmp, cheb, le32toh(dirent->length)); + if (err) + return err; + return CHFS_NODE_BADNAMECRC; + } + + /* Check vnode_cache of parent node */ + mutex_enter(&chmp->chm_lock_vnocache); + vc = chfs_scan_make_vnode_cache(chmp, le64toh(dirent->pvno)); + mutex_exit(&chmp->chm_lock_vnocache); + if (!vc) { + chfs_free_dirent(fd); + return ENOMEM; + } + + fd->nref->nref_offset = ofs; + + dbg("add dirent to #%llu\n", (unsigned long long)vc->vno); + chfs_add_node_to_list(chmp, vc, fd->nref, &vc->dirents); + /*tmp = vc->dirents; + dbg("START|vno: %d dirents dump\n", vc->vno); + while (tmp) { + dbg(" ->nref->nref_lnr: %d\n", tmp->lnr); + dbg(" ->nref->nref_offset: %d\n", tmp->offset); + tmp = tmp->next; + } + dbg(" END|vno: %d dirents dump\n", vc->vno);*/ + +// fd->next = NULL; + fd->vno = le64toh(dirent->vno); + fd->version = le64toh(dirent->version); + fd->nhash = hash32_buf(fd->name, namelen, HASH32_BUF_INIT); + fd->type = dirent->dtype; + + /*dbg("dirent dump:\n"); + dbg(" ->vno: %d\n", fd->vno); + dbg(" ->version: %ld\n", fd->version); + dbg(" ->nhash: 0x%x\n", fd->nhash); + dbg(" ->nsize: %d\n", fd->nsize); + dbg(" ->name: %s\n", fd->name); + dbg(" ->type: %d\n", fd->type);*/ + + chfs_add_fd_to_list(chmp, fd, vc); + + /*struct chfs_node_ref *tmp; + tmp = vc->dirents; + dbg("START|vno: %d dirents dump\n", vc->vno); + while (tmp) { + dbg(" ->nref->nref_lnr: %d\n", tmp->lnr); + dbg(" ->nref->nref_offset: %d\n", tmp->offset); + tmp = tmp->next; + } + dbg(" END|vno: %d dirents dump\n", vc->vno);*/ + + /*dbg("dirent dump:\n"); + dbg(" ->magic: 0x%x\n", le16toh(dirent->magic)); + dbg(" ->type: %d\n", le16toh(dirent->type)); + dbg(" ->length: %d\n", le32toh(dirent->length)); + dbg(" ->hdr_crc: 0x%x\n", le32toh(dirent->hdr_crc)); + dbg(" ->vno: %d\n", le64toh(dirent->vno)); + dbg(" ->pvno: %d\n", le64toh(dirent->pvno)); + dbg(" ->version: %ld\n", le64toh(dirent->version)); + dbg(" ->mctime: %d\n", le32toh(dirent->mctime)); + dbg(" ->nsize: %d\n", dirent->nsize); + dbg(" ->dtype: %d\n", dirent->dtype); + dbg(" ->name_crc: 0x%x\n", le32toh(dirent->name_crc)); + dbg(" ->node_crc: 0x%x\n", le32toh(dirent->node_crc)); + dbg(" ->name: %s\n", dirent->name);*/ + + return CHFS_NODE_OK; +} + +/** + * chfs_scan_check_data_node - check vnode crc and add to vnode cache + * @chmp: CHFS main descriptor structure + * @cheb: eraseblock informations + * @buf: data node to check + * @ofs: offset in eraseblock where data node starts + */ +int +chfs_scan_check_data_node(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb, void *buf, off_t ofs) +{ + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + int err; + uint32_t crc, vno; + struct chfs_node_ref *nref; + struct chfs_vnode_cache *vc; + struct chfs_flash_data_node *dnode = buf; + + crc = crc32(0, (uint8_t *)dnode, sizeof(struct chfs_flash_data_node) - 4); + if (crc != le32toh(dnode->node_crc)) { + err = chfs_update_eb_dirty(chmp, cheb, le32toh(dnode->length)); + if (err) + return err; + return CHFS_NODE_BADCRC; + } + /** + * Don't check data nodes crc and version here, it will be done in + * the background GC thread. 
+ */ + nref = chfs_alloc_node_ref(cheb); + if (!nref) + return ENOMEM; + + nref->nref_offset = ofs | CHFS_UNCHECKED_NODE_MASK; + + KASSERT(nref->nref_lnr == cheb->lnr); + + vno = le64toh(dnode->vno); + mutex_enter(&chmp->chm_lock_vnocache); + vc = chfs_vnode_cache_get(chmp, vno); + if (!vc) { + vc = chfs_scan_make_vnode_cache(chmp, vno); + if (!vc) + return ENOMEM; + } + mutex_exit(&chmp->chm_lock_vnocache); + chfs_add_node_to_list(chmp, vc, nref, &vc->dnode); + + dbg("chmpfree: %u, chebfree: %u, dnode: %u\n", chmp->chm_free_size, cheb->free_size, dnode->length); + + mutex_enter(&chmp->chm_lock_sizes); + chfs_change_size_free(chmp, cheb, -dnode->length); + chfs_change_size_unchecked(chmp, cheb, dnode->length); + mutex_exit(&chmp->chm_lock_sizes); + return CHFS_NODE_OK; +} + +/** + * chfs_scan_classify_cheb - determine eraseblock's state + * @chmp: CHFS main descriptor structure + * @cheb: eraseblock to classify + */ +int +chfs_scan_classify_cheb(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb) +{ + if (cheb->free_size == chmp->chm_ebh->eb_size) + return CHFS_BLK_STATE_FREE; + else if (cheb->dirty_size < MAX_DIRTY_TO_CLEAN) + return CHFS_BLK_STATE_CLEAN; + else if (cheb->used_size || cheb->unchecked_size) + return CHFS_BLK_STATE_PARTDIRTY; + else + return CHFS_BLK_STATE_ALLDIRTY; +} + + +/** + * chfs_scan_eraseblock - scans an eraseblock and looks for nodes + * @chmp: CHFS main descriptor structure + * @cheb: eraseblock to scan + * + * This function scans a whole eraseblock, checks the nodes on it and adds them + * to the vnode cache. + * Returns the eraseblock state on success, or an error code on failure. + */ +int +chfs_scan_eraseblock(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb) { + + int err; + size_t len, retlen; + off_t ofs = 0; + int lnr = cheb->lnr; + u_char *buf; + struct chfs_flash_node_hdr *nhdr; + int read_free = 0; + struct chfs_node_ref *nref; + + + dbg("scanning eraseblock content: %d free_size: %d\n", cheb->lnr, cheb->free_size); + dbg("scanned physical block: %d\n", chmp->chm_ebh->lmap[lnr]); + buf = kmem_alloc(CHFS_MAX_NODE_SIZE, KM_SLEEP); + + while((ofs + CHFS_NODE_HDR_SIZE) < chmp->chm_ebh->eb_size) { + memset(buf, 0 , CHFS_MAX_NODE_SIZE); + err = chfs_read_leb(chmp, + lnr, buf, ofs, CHFS_NODE_HDR_SIZE, &retlen); + if (err) { + return err; + } + + if (retlen != CHFS_NODE_HDR_SIZE) { + chfs_err("Error reading node header: " + "read: %zu instead of: %zu\n", + CHFS_NODE_HDR_SIZE, retlen); + return EIO; + } + + /* first we check if the buffer we read is full of 0xff; if so, maybe + * the block's remaining area is free. We increase read_free and if it + * reaches MAX_READ_FREE we stop reading the block. */ + if (check_pattern(buf, 0xff, 0, CHFS_NODE_HDR_SIZE)) { + read_free += CHFS_NODE_HDR_SIZE; + if (read_free >= MAX_READ_FREE(chmp)) { + dbg("rest of the block is free.
Size: %d\n", cheb->free_size); + return chfs_scan_classify_cheb(chmp, cheb); + } + ofs += CHFS_NODE_HDR_SIZE; + continue; + } else { + chfs_update_eb_dirty(chmp, cheb, read_free); + read_free = 0; + } + + nhdr = (struct chfs_flash_node_hdr *)buf; + + err = chfs_scan_check_node_hdr(nhdr); + if (err) { + dbg("node hdr error\n"); + err = chfs_update_eb_dirty(chmp, cheb, 4); + if (err) { + return err; + } + + ofs += 4; + continue; + } + ofs += CHFS_NODE_HDR_SIZE; + if (ofs > chmp->chm_ebh->eb_size) { + chfs_err("Second part of node is on the next eraseblock.\n"); + return EIO; + } + switch (le16toh(nhdr->type)) { + case CHFS_NODETYPE_VNODE: + /* Read up the node */ + //dbg("nodetype vnode\n"); + len = le32toh(nhdr->length) - CHFS_NODE_HDR_SIZE; + err = chfs_read_leb(chmp, + lnr, buf + CHFS_NODE_HDR_SIZE, + ofs, len, &retlen); + if (err) { + return err; + } + + if (retlen != len) { + chfs_err("Error reading vnode: read: %zu instead of: %zu\n", + len, retlen); + return EIO; + } + KASSERT(lnr == cheb->lnr); + err = chfs_scan_check_vnode(chmp, + cheb, buf, ofs - CHFS_NODE_HDR_SIZE); + if (err) { + return err; + } + + //dbg("XXX5end\n"); + break; + case CHFS_NODETYPE_DIRENT: + /* Read up the node */ + //dbg("nodetype dirent\n"); + len = le32toh(nhdr->length) - CHFS_NODE_HDR_SIZE; + + err = chfs_read_leb(chmp, + lnr, buf + CHFS_NODE_HDR_SIZE, + ofs, len, &retlen); + if (err) { + return err; + } + + if (retlen != len) { + chfs_err("Error reading dirent node: read: %zu " + "instead of: %zu\n", len, retlen); + return EIO; + } + + KASSERT(lnr == cheb->lnr); + + err = chfs_scan_check_dirent_node(chmp, + cheb, buf, ofs - CHFS_NODE_HDR_SIZE); + if (err) { + return err; + } + + //dbg("XXX6end\n"); + break; + case CHFS_NODETYPE_DATA: + //dbg("nodetype data\n"); + len = sizeof(struct chfs_flash_data_node) - + CHFS_NODE_HDR_SIZE; + err = chfs_read_leb(chmp, + lnr, buf + CHFS_NODE_HDR_SIZE, + ofs, len, &retlen); + if (err) { + return err; + } + + if (retlen != len) { + chfs_err("Error reading data node: read: %zu " + "instead of: %zu\n", len, retlen); + return EIO; + } + KASSERT(lnr == cheb->lnr); + err = chfs_scan_check_data_node(chmp, + cheb, buf, ofs - CHFS_NODE_HDR_SIZE); + if (err) + return err; + + //dbg("XXX7end\n"); + break; + case CHFS_NODETYPE_PADDING: + //dbg("nodetype padding\n"); + //dbg("padding len: %d\n", le32toh(nhdr->length)); + //dbg("BEF: cheb->free_size: %d\n", cheb->free_size); + nref = chfs_alloc_node_ref(cheb); + nref->nref_offset = ofs - CHFS_NODE_HDR_SIZE; + nref->nref_offset = CHFS_GET_OFS(nref->nref_offset) | + CHFS_OBSOLETE_NODE_MASK; + + err = chfs_update_eb_dirty(chmp, cheb, + le32toh(nhdr->length)); + //dbg("AFT: cheb->free_size: %d\n", cheb->free_size); + if (err) + return err; + + //dbg("XXX8end\n"); + break; + default: + //dbg("nodetype ? 
(default)\n"); + /* Unknown node type, update dirty and skip */ + err = chfs_update_eb_dirty(chmp, cheb, + le32toh(nhdr->length)); + if (err) + return err; + + //dbg("XXX9end\n"); + break; + } + ofs += le32toh(nhdr->length) - CHFS_NODE_HDR_SIZE; + } + + KASSERT(cheb->used_size + cheb->free_size + cheb->dirty_size + + cheb->unchecked_size + cheb->wasted_size == chmp->chm_ebh->eb_size); + + //dbg("XXX10\n"); + return chfs_scan_classify_cheb(chmp, cheb); +} diff --git a/sys/ufs/chfs/chfs_subr.c b/sys/ufs/chfs/chfs_subr.c new file mode 100644 index 000000000..00cd82f32 --- /dev/null +++ b/sys/ufs/chfs/chfs_subr.c @@ -0,0 +1,540 @@ +/* $NetBSD: chfs_subr.c,v 1.2 2011/11/24 21:09:37 agc Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * Copyright (C) 2010 Tamas Toth + * Copyright (C) 2010 Adam Hoka + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Efficient memory file system supporting functions. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include "chfs.h" +//#include +//#include + +/* --------------------------------------------------------------------- */ + +/* + * Returns information about the number of available memory pages, + * including physical and virtual ones. + * + * If 'total' is true, the value returned is the total amount of memory + * pages configured for the system (either in use or free). + * If it is FALSE, the value returned is the amount of free memory pages. + * + * Remember to remove DUMMYFS_PAGES_RESERVED from the returned value to avoid + * excessive memory usage. 
+ * + */ +size_t +chfs_mem_info(bool total) +{ + size_t size; + + size = 0; + size += uvmexp.swpgavail; + if (!total) { + size -= uvmexp.swpgonly; + } + size += uvmexp.free; + size += uvmexp.filepages; + if (size > uvmexp.wired) { + size -= uvmexp.wired; + } else { + size = 0; + } + + return size; +} + + +/* --------------------------------------------------------------------- */ + +/* + * Looks for a directory entry in the directory represented by node. + * 'cnp' describes the name of the entry to look for. Note that the . + * and .. components are not allowed as they do not physically exist + * within directories. + * + * Returns a pointer to the entry when found, otherwise NULL. + */ +struct chfs_dirent * +chfs_dir_lookup(struct chfs_inode *ip, struct componentname *cnp) +{ + bool found; + struct chfs_dirent *fd; + dbg("dir_lookup()\n"); + + KASSERT(IMPLIES(cnp->cn_namelen == 1, cnp->cn_nameptr[0] != '.')); + KASSERT(IMPLIES(cnp->cn_namelen == 2, !(cnp->cn_nameptr[0] == '.' && + cnp->cn_nameptr[1] == '.'))); + //CHFS_VALIDATE_DIR(node); + + //node->chn_status |= CHFS_NODE_ACCESSED; + + found = false; +// fd = ip->dents; +// while(fd) { + TAILQ_FOREACH(fd, &ip->dents, fds) { + KASSERT(cnp->cn_namelen < 0xffff); + if (fd->vno == 0) + continue; + /*dbg("dirent dump:\n"); + dbg(" ->vno: %d\n", fd->vno); + dbg(" ->version: %ld\n", fd->version); + dbg(" ->nhash: 0x%x\n", fd->nhash); + dbg(" ->nsize: %d\n", fd->nsize); + dbg(" ->name: %s\n", fd->name); + dbg(" ->type: %d\n", fd->type);*/ + if (fd->nsize == (uint16_t)cnp->cn_namelen && + memcmp(fd->name, cnp->cn_nameptr, fd->nsize) == 0) { + found = true; + break; + } +// fd = fd->next; + } + + return found ? fd : NULL; +} + +/* --------------------------------------------------------------------- */ + +int +chfs_filldir(struct uio* uio, ino_t ino, const char *name, + int namelen, enum vtype type) +{ + struct dirent dent; + int error; + + memset(&dent, 0, sizeof(dent)); + + dent.d_fileno = ino; + switch (type) { + case VBLK: + dent.d_type = DT_BLK; + break; + + case VCHR: + dent.d_type = DT_CHR; + break; + + case VDIR: + dent.d_type = DT_DIR; + break; + + case VFIFO: + dent.d_type = DT_FIFO; + break; + + case VLNK: + dent.d_type = DT_LNK; + break; + + case VREG: + dent.d_type = DT_REG; + break; + + case VSOCK: + dent.d_type = DT_SOCK; + break; + + default: + KASSERT(0); + } + dent.d_namlen = namelen; + (void)memcpy(dent.d_name, name, dent.d_namlen); + dent.d_reclen = _DIRENT_SIZE(&dent); + + if (dent.d_reclen > uio->uio_resid) { + error = -1; + } else { + error = uiomove(&dent, dent.d_reclen, uio); + } + + return error; +} + + +/* --------------------------------------------------------------------- */ + +/* + * Change size of the given vnode. + * Caller should execute chfs_update on vp after a successful execution. + * The vnode must be locked on entry and remain locked on exit. + */ +int +chfs_chsize(struct vnode *vp, u_quad_t size, kauth_cred_t cred) +{ + struct chfs_mount *chmp; + struct chfs_inode *ip; + struct buf *bp; + int blknum, append; + int error = 0; + char *buf = NULL; + struct chfs_full_dnode *fd; + + ip = VTOI(vp); + chmp = ip->chmp; + + dbg("chfs_chsize\n"); + + switch (vp->v_type) { + case VDIR: + return EISDIR; + case VLNK: + case VREG: + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return EROFS; + break; + case VBLK: + case VCHR: + case VFIFO: + return 0; + default: + return EOPNOTSUPP; /* XXX why not ENODEV? 
*/ + } + + vflushbuf(vp, 0); + + mutex_enter(&chmp->chm_lock_mountfields); + chfs_flush_pending_wbuf(chmp); + + /* handle truncate to zero as a special case */ + if (size == 0) { + dbg("truncate to zero"); + chfs_truncate_fragtree(ip->chmp, + &ip->fragtree, size); + chfs_set_vnode_size(vp, size); + + mutex_exit(&chmp->chm_lock_mountfields); + + return 0; + } + + + /* allocate zeros for the new data */ + buf = kmem_zalloc(size, KM_SLEEP); + bp = getiobuf(vp, true); + + if (ip->size != 0) { + /* read the whole data */ + bp->b_blkno = 0; + bp->b_bufsize = bp->b_resid = bp->b_bcount = ip->size; + bp->b_data = kmem_alloc(ip->size, KM_SLEEP); + + error = chfs_read_data(chmp, vp, bp); + if (error) { + mutex_exit(&chmp->chm_lock_mountfields); + putiobuf(bp); + + return error; + } + + /* create the new data */ + dbg("create new data vap%llu ip%llu\n", + (unsigned long long)size, (unsigned long long)ip->size); + append = size - ip->size; + if (append > 0) { + memcpy(buf, bp->b_data, ip->size); + } else { + memcpy(buf, bp->b_data, size); + chfs_truncate_fragtree(ip->chmp, + &ip->fragtree, size); + } + + kmem_free(bp->b_data, ip->size); + + struct chfs_node_frag *lastfrag = frag_last(&ip->fragtree); + fd = lastfrag->node; + chfs_mark_node_obsolete(chmp, fd->nref); + + blknum = lastfrag->ofs / PAGE_SIZE; + lastfrag->size = append > PAGE_SIZE ? PAGE_SIZE : size % PAGE_SIZE; + } else { + fd = chfs_alloc_full_dnode(); + blknum = 0; + } + + chfs_set_vnode_size(vp, size); + + // write the new data + for (bp->b_blkno = blknum; bp->b_blkno * PAGE_SIZE < size; bp->b_blkno++) { + uint64_t writesize = MIN(size - bp->b_blkno * PAGE_SIZE, PAGE_SIZE); + + bp->b_bufsize = bp->b_resid = bp->b_bcount = writesize; + bp->b_data = kmem_alloc(writesize, KM_SLEEP); + + memcpy(bp->b_data, buf + (bp->b_blkno * PAGE_SIZE), writesize); + + if (bp->b_blkno != blknum) { + fd = chfs_alloc_full_dnode(); + } + + error = chfs_write_flash_dnode(chmp, vp, bp, fd); + if (error) { + mutex_exit(&chmp->chm_lock_mountfields); + kmem_free(bp->b_data, writesize); + putiobuf(bp); + + return error; + } + if (bp->b_blkno != blknum) { + chfs_add_full_dnode_to_inode(chmp, ip, fd); + } + kmem_free(bp->b_data, writesize); + } + + mutex_exit(&chmp->chm_lock_mountfields); + + kmem_free(buf, size); + putiobuf(bp); + + return 0; +} +#if 0 + int error; + struct chfs_node *node; + + KASSERT(VOP_ISLOCKED(vp)); + + node = VP_TO_CHFS_NODE(vp); + + // Decide whether this is a valid operation based on the file type. + error = 0; + switch (vp->v_type) { + case VDIR: + return EISDIR; + + case VREG: + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return EROFS; + break; + + case VBLK: + case VCHR: + case VFIFO: + // Allow modifications of special files even if in the file + // system is mounted read-only (we are not modifying the + // files themselves, but the objects they represent). + return 0; + + default: + return ENODEV; + } + + // Immutable or append-only files cannot be modified, either. + if (node->chn_flags & (IMMUTABLE | APPEND)) + return EPERM; + + error = chfs_truncate(vp, size); + // chfs_truncate will raise the NOTE_EXTEND and NOTE_ATTRIB kevents + // for us, as will update dn_status; no need to do that here. + + KASSERT(VOP_ISLOCKED(vp)); + + return error; +#endif + +/* --------------------------------------------------------------------- */ + +/* + * Change flags of the given vnode. + * Caller should execute chfs_update on vp after a successful execution. + * The vnode must be locked on entry and remain locked on exit. 
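+ * Only the superuser may set the system flags (SF_*); the file owner is
+ * limited to the user-settable flags (UF_*).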
+ */ +int +chfs_chflags(struct vnode *vp, int flags, kauth_cred_t cred) +{ + struct chfs_mount *chmp; + struct chfs_inode *ip; + int error = 0; + + ip = VTOI(vp); + chmp = ip->chmp; + + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return EROFS; + + if (kauth_cred_geteuid(cred) != ip->uid && + (error = kauth_authorize_generic(cred, + KAUTH_GENERIC_ISSUSER, NULL))) + return error; + + if (kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, + NULL) == 0) { + if ((ip->flags & (SF_IMMUTABLE | SF_APPEND)) && + kauth_authorize_system(curlwp->l_cred, + KAUTH_SYSTEM_CHSYSFLAGS, 0, NULL, NULL, NULL)) + return EPERM; + + if ((flags & SF_SNAPSHOT) != + (ip->flags & SF_SNAPSHOT)) + return EPERM; + + ip->flags = flags; + } else { + if ((ip->flags & (SF_IMMUTABLE | SF_APPEND)) || + (flags & UF_SETTABLE) != flags) + return EPERM; + + if ((ip->flags & SF_SETTABLE) != + (flags & SF_SETTABLE)) + return EPERM; + + ip->flags &= SF_SETTABLE; + ip->flags |= (flags & UF_SETTABLE); + } + ip->iflag |= IN_CHANGE; + error = chfs_update(vp, NULL, NULL, UPDATE_WAIT); + if (error) + return error; + + if (flags & (IMMUTABLE | APPEND)) + return 0; + + return error; +} + +/* --------------------------------------------------------------------- */ + +void +chfs_itimes(struct chfs_inode *ip, const struct timespec *acc, + const struct timespec *mod, const struct timespec *cre) +{ + //dbg("itimes\n"); + struct timespec now; + + if (!(ip->iflag & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY))) { + return; + } + + vfs_timestamp(&now); + if (ip->iflag & IN_ACCESS) { + if (acc == NULL) + acc = &now; + ip->atime = acc->tv_sec; + } + if (ip->iflag & (IN_UPDATE | IN_MODIFY)) { + if (mod == NULL) + mod = &now; + ip->mtime = mod->tv_sec; + //ip->i_modrev++; + } + if (ip->iflag & (IN_CHANGE | IN_MODIFY)) { + if (cre == NULL) + cre = &now; + ip->ctime = cre->tv_sec; + } + if (ip->iflag & (IN_ACCESS | IN_MODIFY)) + ip->iflag |= IN_ACCESSED; + if (ip->iflag & (IN_UPDATE | IN_CHANGE)) + ip->iflag |= IN_MODIFIED; + ip->iflag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY); +} + +/* --------------------------------------------------------------------- */ + +int +chfs_update(struct vnode *vp, const struct timespec *acc, + const struct timespec *mod, int flags) +{ + + struct chfs_inode *ip; + + /* XXX ufs_reclaim calls this function unlocked! */ +// KASSERT(VOP_ISLOCKED(vp)); + +#if 0 + if (flags & UPDATE_CLOSE) + ; /* XXX Need to do anything special? 
*/ +#endif + + ip = VTOI(vp); + chfs_itimes(ip, acc, mod, NULL); + +// KASSERT(VOP_ISLOCKED(vp)); + return (0); +} + +/* --------------------------------------------------------------------- */ +/* + int + chfs_truncate(struct vnode *vp, off_t length) + { + bool extended; + int error; + struct chfs_node *node; + printf("CHFS: truncate()\n"); + + node = VP_TO_CHFS_NODE(vp); + extended = length > node->chn_size; + + if (length < 0) { + error = EINVAL; + goto out; + } + + if (node->chn_size == length) { + error = 0; + goto out; + } + + error = chfs_reg_resize(vp, length); + if (error == 0) + node->chn_status |= CHFS_NODE_CHANGED | CHFS_NODE_MODIFIED; + + out: + chfs_update(vp, NULL, NULL, 0); + + return error; + }*/ + + diff --git a/sys/ufs/chfs/chfs_vfsops.c b/sys/ufs/chfs/chfs_vfsops.c new file mode 100644 index 000000000..a08d15fb7 --- /dev/null +++ b/sys/ufs/chfs/chfs_vfsops.c @@ -0,0 +1,847 @@ +/* $NetBSD: chfs_vfsops.c,v 1.2 2011/11/24 21:09:37 agc Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * Copyright (C) 2010 Tamas Toth + * Copyright (C) 2010 Adam Hoka + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +//XXX needed just for debugging +#include +#include +#include +#include + +#include +#include +#include +//#include +#include +#include +#include +#include +//#include +//#include +#include "chfs.h" +#include "chfs_args.h" + +MODULE(MODULE_CLASS_VFS, chfs, "flash"); + +/* --------------------------------------------------------------------- */ +/* functions */ + +static int chfs_mount(struct mount *, const char *, void *, size_t *); +static int chfs_unmount(struct mount *, int); +static int chfs_root(struct mount *, struct vnode **); +static int chfs_vget(struct mount *, ino_t, struct vnode **); +static int chfs_fhtovp(struct mount *, struct fid *, struct vnode **); +static int chfs_vptofh(struct vnode *, struct fid *, size_t *); +static int chfs_start(struct mount *, int); +static int chfs_statvfs(struct mount *, struct statvfs *); +static int chfs_sync(struct mount *, int, kauth_cred_t); +static void chfs_init(void); +static void chfs_reinit(void); +static void chfs_done(void); +static int chfs_snapshot(struct mount *, struct vnode *, + struct timespec *); + +/* --------------------------------------------------------------------- */ +/* structures */ + +int +chfs_gop_alloc(struct vnode *vp, off_t off, off_t len, int flags, + kauth_cred_t cred) +{ + return (0); +} + +const struct genfs_ops chfs_genfsops = { + .gop_size = genfs_size, + .gop_alloc = chfs_gop_alloc, + .gop_write = genfs_gop_write, + .gop_markupdate = ufs_gop_markupdate, +}; + +/* +static const struct ufs_ops chfs_ufsops = { + .uo_itimes = chfs_itimes, + .uo_update = chfs_update, +}; +*/ + +struct pool chfs_inode_pool; + +/* for looking up the major for flash */ +extern const struct cdevsw flash_cdevsw; + +/* --------------------------------------------------------------------- */ + +static int +chfs_mount(struct mount *mp, + const char *path, void *data, size_t *data_len) +{ + struct lwp *l = curlwp; + struct nameidata nd; + struct pathbuf *pb; + struct vnode *devvp = NULL; + struct ufs_args *args = data; + struct ufsmount *ump = NULL; + struct chfs_mount *chmp; + int err = 0; + int xflags; + + dbg("mount()\n"); + + if (*data_len < sizeof *args) + return EINVAL; + + if (mp->mnt_flag & MNT_GETARGS) { + ump = VFSTOUFS(mp); + if (ump == NULL) + return EIO; + memset(args, 0, sizeof *args); + args->fspec = NULL; + *data_len = sizeof *args; + return 0; + } + + if (mp->mnt_flag & MNT_UPDATE) { + /* XXX: There is no support yet to update file system + * settings. Should be added. */ + + return ENODEV; + } + + if (args->fspec != NULL) { + err = pathbuf_copyin(args->fspec, &pb); + if (err) { + return err; + } + /* + * Look up the name and verify that it's sane. 
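+ * The resolved vnode must refer to a block device backed by a registered
+ * block device driver.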
+ */ + NDINIT(&nd, LOOKUP, FOLLOW, pb); + if ((err = namei(&nd)) != 0 ) + return (err); + devvp = nd.ni_vp; + + /* + * Be sure this is a valid block device + */ + if (devvp->v_type != VBLK) + err = ENOTBLK; + else if (bdevsw_lookup(devvp->v_rdev) == NULL) + err = ENXIO; + } + + if (err) { + vrele(devvp); + return (err); + } + + if (mp->mnt_flag & MNT_RDONLY) + xflags = FREAD; + else + xflags = FREAD|FWRITE; + + err = VOP_OPEN(devvp, xflags, FSCRED); + if (err) + goto fail; + + + err = chfs_mountfs(devvp, mp); + if (err) { + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + (void)VOP_CLOSE(devvp, xflags, NOCRED); + VOP_UNLOCK(devvp); + goto fail; + } + ump = VFSTOUFS(mp); + chmp = ump->um_chfs; + + vfs_getnewfsid(mp); + chmp->chm_fsmp = mp; + + return set_statvfs_info(path, + UIO_USERSPACE, args->fspec, + UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l); + +fail: + vrele(devvp); + return (err); +} + + +int +chfs_mountfs(struct vnode *devvp, struct mount *mp) +{ + struct lwp *l = curlwp; + struct proc *p; + kauth_cred_t cred; + devmajor_t flash_major; + dev_t dev; + struct ufsmount* ump = NULL; + struct chfs_mount* chmp; + struct vnode *vp; + int err = 0; + + dbg("mountfs()\n"); + + dev = devvp->v_rdev; + p = l ? l->l_proc : NULL; + cred = l ? l->l_cred : NOCRED; + + /* Flush out any old buffers remaining from a previous use. */ + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + err = vinvalbuf(devvp, V_SAVE, cred, l, 0, 0); + VOP_UNLOCK(devvp); + if (err) + return (err); + + flash_major = cdevsw_lookup_major(&flash_cdevsw); + + if (devvp->v_type != VBLK) + err = ENOTBLK; + else if (bdevsw_lookup(dev) == NULL) + err = ENXIO; + else if (major(dev) != flash_major) { + dbg("major(dev): %d, flash_major: %d\n", + major(dev), flash_major); + err = ENODEV; + } + if (err) { + vrele(devvp); + return (err); + } + + ump = malloc(sizeof(*ump), M_UFSMNT, M_WAITOK); + memset(ump, 0, sizeof(*ump)); + ump->um_fstype = UFS1; + //ump->um_ops = &chfs_ufsops; + ump->um_chfs = malloc(sizeof(struct chfs_mount), + M_UFSMNT, M_WAITOK); + memset(ump->um_chfs, 0, sizeof(struct chfs_mount)); + + mutex_init(&ump->um_lock, MUTEX_DEFAULT, IPL_NONE); + + /* Get superblock and set flash device number */ + chmp = ump->um_chfs; + if (!chmp) + return ENOMEM; + + chmp->chm_ebh = kmem_alloc(sizeof(struct chfs_ebh), KM_SLEEP); + + dbg("[]opening flash: %u\n", (unsigned int)devvp->v_rdev); + err = ebh_open(chmp->chm_ebh, devvp->v_rdev); + if (err) { + dbg("error while opening flash\n"); + kmem_free(chmp->chm_ebh, sizeof(struct chfs_ebh)); + free(chmp, M_UFSMNT); + return err; + } + + //TODO check flash sizes + + chmp->chm_gbl_version = 0; + chmp->chm_vnocache_hash = chfs_vnocache_hash_init(); + + chmp->chm_blocks = kmem_zalloc(chmp->chm_ebh->peb_nr * + sizeof(struct chfs_eraseblock), KM_SLEEP); + + if (!chmp->chm_blocks) { + kmem_free(chmp->chm_ebh, chmp->chm_ebh->peb_nr * + sizeof(struct chfs_eraseblock)); + ebh_close(chmp->chm_ebh); + free(chmp, M_UFSMNT); + return ENOMEM; + } + + mutex_init(&chmp->chm_lock_mountfields, MUTEX_DEFAULT, IPL_NONE); + mutex_init(&chmp->chm_lock_sizes, MUTEX_DEFAULT, IPL_NONE); + mutex_init(&chmp->chm_lock_vnocache, MUTEX_DEFAULT, IPL_NONE); + + //XXX + chmp->chm_fs_bmask = -4096; + chmp->chm_fs_bsize = 4096; + chmp->chm_fs_qbmask = 4095; + chmp->chm_fs_bshift = 12; + chmp->chm_fs_fmask = -2048; + chmp->chm_fs_qfmask = 2047; + + chmp->chm_wbuf_pagesize = chmp->chm_ebh->flash_if->page_size; + dbg("wbuf size: %zu\n", chmp->chm_wbuf_pagesize); + chmp->chm_wbuf = kmem_alloc(chmp->chm_wbuf_pagesize, KM_SLEEP); + 
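+	/* Write-back buffer: one flash page in size, guarded by chm_lock_wbuf. */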
rw_init(&chmp->chm_lock_wbuf); + + //init queues + TAILQ_INIT(&chmp->chm_free_queue); + TAILQ_INIT(&chmp->chm_clean_queue); + TAILQ_INIT(&chmp->chm_dirty_queue); + TAILQ_INIT(&chmp->chm_very_dirty_queue); + TAILQ_INIT(&chmp->chm_erasable_pending_wbuf_queue); + TAILQ_INIT(&chmp->chm_erase_pending_queue); + + chfs_calc_trigger_levels(chmp); + + chmp->chm_nr_free_blocks = 0; + chmp->chm_nr_erasable_blocks = 0; + chmp->chm_max_vno = 2; + chmp->chm_checked_vno = 2; + chmp->chm_unchecked_size = 0; + chmp->chm_used_size = 0; + chmp->chm_dirty_size = 0; + chmp->chm_wasted_size = 0; + chmp->chm_free_size = chmp->chm_ebh->eb_size * chmp->chm_ebh->peb_nr; + err = chfs_build_filesystem(chmp); + + if (err) { + chfs_vnocache_hash_destroy(chmp->chm_vnocache_hash); + kmem_free(chmp->chm_ebh, chmp->chm_ebh->peb_nr * + sizeof(struct chfs_eraseblock)); + ebh_close(chmp->chm_ebh); + free(chmp, M_UFSMNT); + return EIO; + } + + mp->mnt_data = ump; + mp->mnt_stat.f_fsidx.__fsid_val[0] = (long)dev; + mp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_CHFS); + mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; + mp->mnt_stat.f_namemax = MAXNAMLEN; + mp->mnt_flag |= MNT_LOCAL; + mp->mnt_fs_bshift = PAGE_SHIFT; + mp->mnt_dev_bshift = DEV_BSHIFT; + mp->mnt_iflag |= IMNT_MPSAFE; + ump->um_flags = 0; + ump->um_mountp = mp; + ump->um_dev = dev; + ump->um_devvp = devvp; + ump->um_maxfilesize = 1048512 * 1024; + /*TODO fill these fields + ump->um_nindir = + ump->um_lognindir = + ump->um_bptrtodb = + ump->um_seqinc = + ump->um_maxsymlinklen = + ump->um_dirblksiz = + ump->um_maxfilesize = + */ + + /* + * Allocate the root vnode. + */ + err = VFS_VGET(mp, CHFS_ROOTINO, &vp); + if (err) { + dbg("error: %d while allocating root node\n", err); + return err; + } + vput(vp); + + chfs_gc_thread_start(chmp); + mutex_enter(&chmp->chm_lock_mountfields); + chfs_gc_trigger(chmp); + mutex_exit(&chmp->chm_lock_mountfields); + + devvp->v_specmountpoint = mp; + return 0; +} + +/* --------------------------------------------------------------------- */ + +/* ARGSUSED2 */ +static int +chfs_unmount(struct mount *mp, int mntflags) +{ + int flags = 0, i = 0; + struct ufsmount *ump; + struct chfs_mount *chmp; +// struct chfs_vnode_cache *vc, *next; + + if (mntflags & MNT_FORCE) + flags |= FORCECLOSE; + + dbg("[START]\n"); + + ump = VFSTOUFS(mp); + chmp = ump->um_chfs; + + chfs_gc_thread_stop(chmp); + + (void)vflush(mp, NULLVP, flags); + + if (chmp->chm_wbuf_len) { + mutex_enter(&chmp->chm_lock_mountfields); + chfs_flush_pending_wbuf(chmp); + mutex_exit(&chmp->chm_lock_mountfields); + } + + for (i = 0; i < chmp->chm_ebh->peb_nr; i++) { + chfs_free_node_refs(&chmp->chm_blocks[i]); + } + + chfs_vnocache_hash_destroy(chmp->chm_vnocache_hash); + + ebh_close(chmp->chm_ebh); + + rw_destroy(&chmp->chm_lock_wbuf); + mutex_destroy(&chmp->chm_lock_vnocache); + mutex_destroy(&chmp->chm_lock_sizes); + mutex_destroy(&chmp->chm_lock_mountfields); + + if (ump->um_devvp->v_type != VBAD) { + ump->um_devvp->v_specmountpoint = NULL; + } + vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); + (void)VOP_CLOSE(ump->um_devvp, FREAD|FWRITE, NOCRED); + vput(ump->um_devvp); + + mutex_destroy(&ump->um_lock); + + //free(ump->um_chfs, M_UFSMNT); + free(ump, M_UFSMNT); + mp->mnt_data = NULL; + mp->mnt_flag &= ~MNT_LOCAL; + dbg("[END]\n"); + return (0); +} + +/* --------------------------------------------------------------------- */ + +static int +chfs_root(struct mount *mp, struct vnode **vpp) +{ + struct vnode *vp; + int error; + + if ((error = VFS_VGET(mp, 
(ino_t)ROOTINO, &vp)) != 0) + return error; + *vpp = vp; + return 0; +} + +/* --------------------------------------------------------------------- */ + +extern rb_tree_ops_t frag_rbtree_ops; + +static int +chfs_vget(struct mount *mp, ino_t ino, struct vnode **vpp) +{ + struct chfs_mount *chmp; + struct chfs_inode *ip; + struct ufsmount *ump; + struct vnode *vp; + dev_t dev; + int error; + struct chfs_vnode_cache* chvc = NULL; + struct chfs_node_ref* nref = NULL; + struct buf *bp; + + dbg("vget() | ino: %llu\n", (unsigned long long)ino); + + ump = VFSTOUFS(mp); + dev = ump->um_dev; +retry: + if (!vpp) { + vpp = kmem_alloc(sizeof(struct vnode*), KM_SLEEP); + } + + if ((*vpp = chfs_ihashget(dev, ino, LK_EXCLUSIVE)) != NULL) { + return 0; + } + + /* Allocate a new vnode/inode. */ + if ((error = getnewvnode(VT_CHFS, + mp, chfs_vnodeop_p, NULL, &vp)) != 0) { + *vpp = NULL; + return (error); + } + ip = pool_get(&chfs_inode_pool, PR_WAITOK); + + mutex_enter(&chfs_hashlock); + if ((*vpp = chfs_ihashget(dev, ino, LK_EXCLUSIVE)) != NULL) { + mutex_exit(&chfs_hashlock); + ungetnewvnode(vp); + pool_put(&chfs_inode_pool, ip); + goto retry; + } + + vp->v_vflag |= VV_LOCKSWORK; + + memset(ip, 0, sizeof(*ip)); + vp->v_data = ip; + ip->vp = vp; + ip->ump = ump; + ip->chmp = chmp = ump->um_chfs; + ip->dev = dev; + ip->ino = ino; + vp->v_mount = mp; + genfs_node_init(vp, &chfs_genfsops); + + rb_tree_init(&ip->fragtree, &frag_rbtree_ops); + //mutex_init(&ip->inode_lock, MUTEX_DEFAULT, IPL_NONE); + + chfs_ihashins(ip); + mutex_exit(&chfs_hashlock); + + // set root inode + if (ino == CHFS_ROOTINO) { + dbg("SETROOT\n"); + vp->v_vflag |= VV_ROOT; + vp->v_type = VDIR; + ip->mode = IFMT | IEXEC | IWRITE | IREAD; + ip->iflag |= (IN_ACCESS | IN_CHANGE | IN_UPDATE); + chfs_update(vp, NULL, NULL, UPDATE_WAIT); +// ip->dents = NULL; XXXTAILQ + TAILQ_INIT(&ip->dents); + chfs_set_vnode_size(vp, 512); + } + + // set vnode cache + mutex_enter(&chmp->chm_lock_vnocache); + chvc = chfs_vnode_cache_get(chmp, ino); + mutex_exit(&chmp->chm_lock_vnocache); + if (!chvc) { + dbg("!chvc\n"); + /* XXX, we cant alloc under a lock, refactor this! 
*/ + chvc = chfs_vnode_cache_alloc(ino); + mutex_enter(&chmp->chm_lock_vnocache); + if (ino == CHFS_ROOTINO) { + chvc->nlink = 2; + chvc->pvno = CHFS_ROOTINO; + chfs_vnode_cache_set_state(chmp, + chvc, VNO_STATE_CHECKEDABSENT); + } + chfs_vnode_cache_add(chmp, chvc); + mutex_exit(&chmp->chm_lock_vnocache); + + ip->chvc = chvc; + TAILQ_INIT(&ip->dents); + } else { + dbg("chvc\n"); + ip->chvc = chvc; + // if we have a vnode cache, the node is already on flash, so read it + if (ino == CHFS_ROOTINO) { + chvc->pvno = CHFS_ROOTINO; + TAILQ_INIT(&chvc->scan_dirents); + } else { + chfs_readvnode(mp, ino, &vp); + } + + mutex_enter(&chmp->chm_lock_mountfields); + // init type specific things + switch (vp->v_type) { + case VDIR: + nref = chvc->dirents; + while (nref && + (struct chfs_vnode_cache *)nref != chvc) { + chfs_readdirent(mp, nref, ip); + nref = nref->nref_next; + } + chfs_set_vnode_size(vp, 512); + break; + case VREG: + case VSOCK: + //build the fragtree of the vnode + dbg("read_inode_internal | ino: %llu\n", + (unsigned long long)ip->ino); + error = chfs_read_inode(chmp, ip); + if (error) { + vput(vp); + *vpp = NULL; + mutex_exit(&chmp->chm_lock_mountfields); + return (error); + } + break; + case VLNK: + //build the fragtree of the vnode + dbg("read_inode_internal | ino: %llu\n", + (unsigned long long)ip->ino); + error = chfs_read_inode_internal(chmp, ip); + if (error) { + vput(vp); + *vpp = NULL; + mutex_exit(&chmp->chm_lock_mountfields); + return (error); + } + + dbg("size: %llu\n", (unsigned long long)ip->size); + bp = getiobuf(vp, true); + bp->b_blkno = 0; + bp->b_bufsize = bp->b_resid = + bp->b_bcount = ip->size; + bp->b_data = kmem_alloc(ip->size, KM_SLEEP); + chfs_read_data(chmp, vp, bp); + if (!ip->target) + ip->target = kmem_alloc(ip->size, + KM_SLEEP); + memcpy(ip->target, bp->b_data, ip->size); + kmem_free(bp->b_data, ip->size); + putiobuf(bp); + + break; + case VCHR: + case VBLK: + case VFIFO: + //build the fragtree of the vnode + dbg("read_inode_internal | ino: %llu\n", + (unsigned long long)ip->ino); + error = chfs_read_inode_internal(chmp, ip); + if (error) { + vput(vp); + *vpp = NULL; + mutex_exit(&chmp->chm_lock_mountfields); + return (error); + } + + bp = getiobuf(vp, true); + bp->b_blkno = 0; + bp->b_bufsize = bp->b_resid = + bp->b_bcount = sizeof(dev_t); + bp->b_data = kmem_alloc(sizeof(dev_t), KM_SLEEP); + chfs_read_data(chmp, vp, bp); + memcpy(&ip->rdev, + bp->b_data, sizeof(dev_t)); + kmem_free(bp->b_data, sizeof(dev_t)); + putiobuf(bp); + if (vp->v_type == VFIFO) + vp->v_op = chfs_fifoop_p; + else { + vp->v_op = chfs_specop_p; + spec_node_init(vp, ip->rdev); + } + + break; + case VNON: + case VBAD: + break; + } + mutex_exit(&chmp->chm_lock_mountfields); + + } + + /* finish inode initalization */ + ip->devvp = ump->um_devvp; + vref(ip->devvp); + + uvm_vnp_setsize(vp, ip->size); + *vpp = vp; + + return 0; +} + +/* --------------------------------------------------------------------- */ + +static int +chfs_fhtovp(struct mount *mp, struct fid *fhp, struct vnode **vpp) +{ + return ENODEV; +} + +/* --------------------------------------------------------------------- */ + +static int +chfs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size) +{ + return ENODEV; +} + +/* --------------------------------------------------------------------- */ + +static int +chfs_start(struct mount *mp, int flags) +{ + return 0; +} + +/* --------------------------------------------------------------------- */ + +/* ARGSUSED2 */ +static int +chfs_statvfs(struct mount *mp, struct 
statvfs *sbp) +{ + struct chfs_mount *chmp; + struct ufsmount *ump; + dbg("statvfs\n"); + + ump = VFSTOUFS(mp); + chmp = ump->um_chfs; + + sbp->f_flag = mp->mnt_flag; + sbp->f_bsize = chmp->chm_ebh->eb_size; + sbp->f_frsize = chmp->chm_ebh->eb_size; + sbp->f_iosize = chmp->chm_ebh->eb_size; + + sbp->f_blocks = chmp->chm_ebh->peb_nr; + sbp->f_files = 0; + sbp->f_bavail = chmp->chm_nr_free_blocks - chmp->chm_resv_blocks_write; +#if 0 + printf("chmp->chm_nr_free_blocks: %jd\n", + (intmax_t )chmp->chm_nr_free_blocks); + printf("chmp->chm_resv_blocks_write: %jd\n", + (intmax_t) chmp->chm_resv_blocks_write); + printf("chmp->chm_ebh->peb_nr: %jd\n", + (intmax_t) chmp->chm_ebh->peb_nr); +#endif + + sbp->f_bfree = chmp->chm_nr_free_blocks; + sbp->f_bresvd = chmp->chm_resv_blocks_write; + + /* FFS specific */ + sbp->f_ffree = 0; + sbp->f_favail = 0; + sbp->f_fresvd = 0; + + copy_statvfs_info(sbp, mp); + + return 0; +} + +/* --------------------------------------------------------------------- */ + +/* ARGSUSED0 */ +static int +chfs_sync(struct mount *mp, int waitfor, + kauth_cred_t uc) +{ + return 0; +} + +/* --------------------------------------------------------------------- */ + +static void +chfs_init(void) +{ + chfs_alloc_pool_caches(); + chfs_ihashinit(); + pool_init(&chfs_inode_pool, sizeof(struct chfs_inode), 0, 0, 0, + "chfsinopl", &pool_allocator_nointr, IPL_NONE); + ufs_init(); +} + +/* --------------------------------------------------------------------- */ + +static void +chfs_reinit(void) +{ + chfs_ihashreinit(); + ufs_reinit(); +} + +/* --------------------------------------------------------------------- */ + +static void +chfs_done(void) +{ + ufs_done(); + chfs_ihashdone(); + pool_destroy(&chfs_inode_pool); + chfs_destroy_pool_caches(); +} + +/* --------------------------------------------------------------------- */ + +static int +chfs_snapshot(struct mount *mp, struct vnode *vp, + struct timespec *ctime) +{ + return ENODEV; +} + +/* --------------------------------------------------------------------- */ + +/* + * chfs vfs operations. 
+ */ + +extern const struct vnodeopv_desc chfs_fifoop_opv_desc; +extern const struct vnodeopv_desc chfs_specop_opv_desc; +extern const struct vnodeopv_desc chfs_vnodeop_opv_desc; + +const struct vnodeopv_desc * const chfs_vnodeopv_descs[] = { + &chfs_fifoop_opv_desc, + &chfs_specop_opv_desc, + &chfs_vnodeop_opv_desc, + NULL, +}; + +struct vfsops chfs_vfsops = { + MOUNT_CHFS, /* vfs_name */ + sizeof (struct chfs_args), + chfs_mount, /* vfs_mount */ + chfs_start, /* vfs_start */ + chfs_unmount, /* vfs_unmount */ + chfs_root, /* vfs_root */ + ufs_quotactl, /* vfs_quotactl */ + chfs_statvfs, /* vfs_statvfs */ + chfs_sync, /* vfs_sync */ + chfs_vget, /* vfs_vget */ + chfs_fhtovp, /* vfs_fhtovp */ + chfs_vptofh, /* vfs_vptofh */ + chfs_init, /* vfs_init */ + chfs_reinit, /* vfs_reinit */ + chfs_done, /* vfs_done */ + NULL, /* vfs_mountroot */ + chfs_snapshot, /* vfs_snapshot */ + vfs_stdextattrctl, /* vfs_extattrctl */ + (void *)eopnotsupp, /* vfs_suspendctl */ + genfs_renamelock_enter, + genfs_renamelock_exit, + (void *)eopnotsupp, + chfs_vnodeopv_descs, + 0, /* vfs_refcount */ + { NULL, NULL }, +}; + +static int +chfs_modcmd(modcmd_t cmd, void *arg) +{ + switch (cmd) { + case MODULE_CMD_INIT: + return vfs_attach(&chfs_vfsops); + case MODULE_CMD_FINI: + return vfs_detach(&chfs_vfsops); + default: + return ENOTTY; + } +} diff --git a/sys/ufs/chfs/chfs_vnode.c b/sys/ufs/chfs/chfs_vnode.c new file mode 100644 index 000000000..2e1b386bd --- /dev/null +++ b/sys/ufs/chfs/chfs_vnode.c @@ -0,0 +1,393 @@ +/* $NetBSD: chfs_vnode.c,v 1.2 2011/11/24 21:09:37 agc Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * Copyright (C) 2010 Tamas Toth + * Copyright (C) 2010 Adam Hoka + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "chfs.h" +#include "chfs_inode.h" +#include +#include +#include +#include +#include + +struct vnode * +chfs_vnode_lookup(struct chfs_mount *chmp, ino_t vno) +{ + struct vnode *vp; + struct chfs_inode *ip; + + TAILQ_FOREACH(vp, &chmp->chm_fsmp->mnt_vnodelist, v_mntvnodes) { + ip = VTOI(vp); + if (ip && ip->ino == vno) + return vp; + } + return NULL; +} + +int +chfs_readvnode(struct mount* mp, ino_t ino, struct vnode** vpp) +{ + struct ufsmount* ump = VFSTOUFS(mp); + struct chfs_mount *chmp = ump->um_chfs; + struct chfs_vnode_cache *chvc; + struct chfs_flash_vnode *chfvn; + struct chfs_inode *ip; + int err; + char* buf; + size_t retlen, len; + struct vnode* vp = NULL; + dbg("readvnode | ino: %llu\n", (unsigned long long)ino); + + len = sizeof(struct chfs_flash_vnode); + + KASSERT(vpp != NULL); + + if (vpp != NULL) { + vp = *vpp; + } + + ip = VTOI(vp); + chvc = ip->chvc; + + if (chvc && ino != CHFS_ROOTINO) { + /* debug... */ + printf("readvnode; offset: %" PRIu32 ", lnr: %d\n", + CHFS_GET_OFS(chvc->v->nref_offset), chvc->v->nref_lnr); + + KASSERT((void *)chvc != (void *)chvc->v); + + buf = kmem_alloc(len, KM_SLEEP); + err = chfs_read_leb(chmp, chvc->v->nref_lnr, buf, + CHFS_GET_OFS(chvc->v->nref_offset), len, &retlen); + if (err) + return err; + if (retlen != len) { + chfs_err("Error reading vnode: read: %zu insted of: %zu\n", + len, retlen); + return EIO; + } + chfvn = (struct chfs_flash_vnode*)buf; + chfs_set_vnode_size(vp, chfvn->dn_size); + ip->mode = chfvn->mode; + vp->v_type = IFTOVT(ip->mode); + ip->version = chfvn->version; + //ip->chvc->highest_version = ip->version; + ip->uid = chfvn->uid; + ip->gid = chfvn->gid; + ip->atime = chfvn->atime; + ip->mtime = chfvn->mtime; + ip->ctime = chfvn->ctime; + kmem_free(buf, len); + } + + + *vpp = vp; + return 0; +} + +int +chfs_readdirent(struct mount *mp, struct chfs_node_ref *chnr, struct chfs_inode *pdir) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct chfs_mount *chmp = ump->um_chfs; + struct chfs_flash_dirent_node chfdn; + struct chfs_dirent *fd;//, *pdents; + size_t len = sizeof(struct chfs_flash_dirent_node); +// struct chfs_vnode_cache* parent; + size_t retlen; + int err = 0; + +// parent = chfs_get_vnode_cache(chmp, pdir->ino); + + //read flash_dirent_node + err = chfs_read_leb(chmp, chnr->nref_lnr, (char *)&chfdn, + CHFS_GET_OFS(chnr->nref_offset), len, &retlen); + if (err) { + return err; + } + if (retlen != len) { + chfs_err("Error reading vnode: read: %zu insted of: %zu\n", + retlen, len); + return EIO; + } + + //set fields of dirent + fd = chfs_alloc_dirent(chfdn.nsize + 1); + fd->version = chfdn.version; + fd->vno = chfdn.vno; + fd->type = chfdn.dtype; + fd->nsize = chfdn.nsize; +// fd->next = NULL; + + err = chfs_read_leb(chmp, chnr->nref_lnr, fd->name, + CHFS_GET_OFS(chnr->nref_offset) + len, chfdn.nsize, &retlen); + if (err) { + return err; + } + + if (retlen != chfdn.nsize) { + chfs_err("Error reading vnode: read: %zu insted of: %zu\n", + len, retlen); + return EIO; + } + + fd->name[fd->nsize] = 0; + fd->nref = chnr; + + chfs_add_fd_to_inode(chmp, pdir, fd); +/* + pdents = pdir->i_chfs_ext.dents; + if (!pdents) + pdir->i_chfs_ext.dents = fd; + else { + while (pdents->next != NULL) { + pdents = pdents->next; + } + pdents->next = fd; + } +*/ + return 0; +} + +/* + * Allocate a new inode. 
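+ * A fresh vnode number is taken from chm_max_vno and a vnode cache entry
+ * is set up for it; the new inode is written to flash, then the parent
+ * directory is updated and a dirent for the new name is written out.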
+ */ +int +chfs_makeinode(int mode, struct vnode *dvp, struct vnode **vpp, + struct componentname *cnp, int type) +{ + struct chfs_inode *ip, *pdir; + struct vnode *vp; + struct ufsmount* ump = VFSTOUFS(dvp->v_mount); + struct chfs_mount* chmp = ump->um_chfs; + struct chfs_vnode_cache* chvc; + int error, ismember = 0; + ino_t vno; + struct chfs_dirent *nfd;//, *fd; + + dbg("makeinode\n"); + pdir = VTOI(dvp); + + *vpp = NULL; + + vno = ++(chmp->chm_max_vno); + + error = VFS_VGET(dvp->v_mount, vno, &vp); + if (error) + return (error); + + mutex_enter(&chmp->chm_lock_vnocache); + chvc = chfs_vnode_cache_get(chmp, vno); + mutex_exit(&chmp->chm_lock_vnocache); + + chvc->pvno = pdir->ino; + chvc->vno_version = kmem_alloc(sizeof(uint64_t), KM_SLEEP); + *(chvc->vno_version) = 1; + if (type != VDIR) + chvc->nlink = 1; + else + chvc->nlink = 2; +// chfs_vnode_cache_set_state(chmp, chvc, VNO_STATE_CHECKEDABSENT); + chvc->state = VNO_STATE_CHECKEDABSENT; + + ip = VTOI(vp); + ip->ino = vno; + + if (type == VDIR) + chfs_set_vnode_size(vp, 512); + else + chfs_set_vnode_size(vp, 0); + + ip->uid = kauth_cred_geteuid(cnp->cn_cred); + ip->gid = kauth_cred_getegid(cnp->cn_cred); + ip->version = 1; + ip->iflag |= (IN_ACCESS | IN_CHANGE | IN_UPDATE); + + ip->chvc = chvc; + //ip->chvc->highest_version = 1; + ip->target = NULL; + + ip->mode = mode; + vp->v_type = type; /* Rest init'd in getnewvnode(). */ + if ((ip->mode & ISGID) && (kauth_cred_ismember_gid(cnp->cn_cred, + ip->gid, &ismember) != 0 || !ismember) && + kauth_authorize_generic(cnp->cn_cred, KAUTH_GENERIC_ISSUSER, NULL)) + ip->mode &= ~ISGID; + + chfs_update(vp, NULL, NULL, UPDATE_WAIT); + + mutex_enter(&chmp->chm_lock_mountfields); + + //write inode to flash + error = chfs_write_flash_vnode(chmp, ip, ALLOC_NORMAL); + if (error) { + mutex_exit(&chmp->chm_lock_mountfields); + vput(vp); + vput(dvp); + return error; + } + //update parent directory and write it to the flash + pdir->iflag |= (IN_ACCESS | IN_CHANGE | IN_MODIFY | IN_UPDATE); + chfs_update(dvp, NULL, NULL, UPDATE_WAIT); + + error = chfs_write_flash_vnode(chmp, pdir, ALLOC_NORMAL); + if (error) { + mutex_exit(&chmp->chm_lock_mountfields); + vput(vp); + vput(dvp); + return error; + } + vput(dvp); + + //set up node's full dirent + nfd = chfs_alloc_dirent(cnp->cn_namelen + 1); + nfd->vno = ip->ino; + nfd->version = (++pdir->chvc->highest_version); + nfd->type = type; +// nfd->next = NULL; + nfd->nsize = cnp->cn_namelen; + memcpy(&(nfd->name), cnp->cn_nameptr, cnp->cn_namelen); + nfd->name[nfd->nsize] = 0; + nfd->nhash = hash32_buf(nfd->name, cnp->cn_namelen, HASH32_BUF_INIT); + + // write out direntry + error = chfs_write_flash_dirent(chmp, pdir, ip, nfd, ip->ino, ALLOC_NORMAL); + if (error) { + mutex_exit(&chmp->chm_lock_mountfields); + vput(vp); + return error; + } + + //TODO set parent's dir times + + chfs_add_fd_to_inode(chmp, pdir, nfd); +/* + fd = pdir->i_chfs_ext.dents; + if (!fd) + pdir->i_chfs_ext.dents = nfd; + else { + while (fd->next != NULL) { + fd = fd->next; + } + fd->next = nfd; + } +*/ + //pdir->i_nlink++; + pdir->chvc->nlink++; + + mutex_exit(&chmp->chm_lock_mountfields); + + *vpp = vp; + return (0); +} + +void +chfs_set_vnode_size(struct vnode *vp, size_t size) +{ + struct chfs_inode *ip; + + KASSERT(vp != NULL); + + ip = VTOI(vp); + KASSERT(ip != NULL); + + ip->size = size; + vp->v_size = vp->v_writesize = size; + return; +} + +void +chfs_change_size_free(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb, int change) +{ + KASSERT(mutex_owned(&chmp->chm_lock_sizes)); + 
KASSERT((int)(chmp->chm_free_size + change) >= 0); + KASSERT((int)(cheb->free_size + change) >= 0); + KASSERT((int)(cheb->free_size + change) <= chmp->chm_ebh->eb_size); + chmp->chm_free_size += change; + cheb->free_size += change; + return; +} + +void +chfs_change_size_dirty(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb, int change) +{ + KASSERT(mutex_owned(&chmp->chm_lock_sizes)); + KASSERT((int)(chmp->chm_dirty_size + change) >= 0); + KASSERT((int)(cheb->dirty_size + change) >= 0); + KASSERT((int)(cheb->dirty_size + change) <= chmp->chm_ebh->eb_size); + chmp->chm_dirty_size += change; + cheb->dirty_size += change; + return; +} + +void +chfs_change_size_unchecked(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb, int change) +{ + KASSERT(mutex_owned(&chmp->chm_lock_sizes)); + KASSERT((int)(chmp->chm_unchecked_size + change) >= 0); + KASSERT((int)(cheb->unchecked_size + change) >= 0); + KASSERT((int)(cheb->unchecked_size + change) <= chmp->chm_ebh->eb_size); + chmp->chm_unchecked_size += change; + cheb->unchecked_size += change; + return; +} + +void +chfs_change_size_used(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb, int change) +{ + KASSERT(mutex_owned(&chmp->chm_lock_sizes)); + KASSERT((int)(chmp->chm_used_size + change) >= 0); + KASSERT((int)(cheb->used_size + change) >= 0); + KASSERT((int)(cheb->used_size + change) <= chmp->chm_ebh->eb_size); + chmp->chm_used_size += change; + cheb->used_size += change; + return; +} + +void +chfs_change_size_wasted(struct chfs_mount *chmp, + struct chfs_eraseblock *cheb, int change) +{ + KASSERT(mutex_owned(&chmp->chm_lock_sizes)); + KASSERT((int)(chmp->chm_wasted_size + change) >= 0); + KASSERT((int)(cheb->wasted_size + change) >= 0); + KASSERT((int)(cheb->wasted_size + change) <= chmp->chm_ebh->eb_size); + chmp->chm_wasted_size += change; + cheb->wasted_size += change; + return; +} + diff --git a/sys/ufs/chfs/chfs_vnode_cache.c b/sys/ufs/chfs/chfs_vnode_cache.c new file mode 100644 index 000000000..101b49402 --- /dev/null +++ b/sys/ufs/chfs/chfs_vnode_cache.c @@ -0,0 +1,165 @@ +/* $NetBSD: chfs_vnode_cache.c,v 1.1 2011/11/24 15:51:32 ahoka Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * Copyright (C) 2010 Tamas Toth + * Copyright (C) 2010 Adam Hoka + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "chfs.h" +#include + +struct chfs_vnode_cache ** +chfs_vnocache_hash_init(void) +{ + return kmem_zalloc(VNODECACHE_SIZE * + sizeof(struct chfs_vnode_cache *), KM_SLEEP); +} + +/** + * chfs_set_vnode_cache_state - set state of a vnode_cache + * @chmp: fs super block info + * @vc: vnode_cache + * @state: new state + */ +void +chfs_vnode_cache_set_state(struct chfs_mount *chmp, + struct chfs_vnode_cache* vc, int state) +{ + /* XXX do we really need locking here? */ + KASSERT(mutex_owned(&chmp->chm_lock_vnocache)); + vc->state = state; +} + +/** + * chfs_get_vnode_cache - get a vnode_cache from the vnocache_hash + * @chmp: fs super block info + * @ino: inode for search + * Returns the vnode_cache. + */ +struct chfs_vnode_cache * +chfs_vnode_cache_get(struct chfs_mount *chmp, ino_t vno) +{ + struct chfs_vnode_cache* ret; + + KASSERT(mutex_owned(&chmp->chm_lock_vnocache)); + + ret = chmp->chm_vnocache_hash[vno % VNODECACHE_SIZE]; + + if (ret == NULL) { + return NULL; + } + + while (ret && ret->vno < vno) { + ret = ret->next; + } + + if (ret && ret->vno != vno) { + ret = NULL; + } + + return ret; +} + +/** + * chfs_add_vnode_cache - add a vnode_cache to the vnocache_hash + * @chmp: fs super block info + * @new: new vnode_cache + */ +void +chfs_vnode_cache_add(struct chfs_mount *chmp, + struct chfs_vnode_cache* new) +{ + struct chfs_vnode_cache** prev; + + KASSERT(mutex_owned(&chmp->chm_lock_vnocache)); + + if (!new->vno) { + new->vno = ++chmp->chm_max_vno; + } + + prev = &chmp->chm_vnocache_hash[new->vno % VNODECACHE_SIZE]; + + while ((*prev) && (*prev)->vno < new->vno) { + prev = &((*prev)->next); + } + new->next = *prev; + *prev = new; +} + +/** + * chfs_del_vnode_cache - del a vnode_cache from the vnocache_hash + * @chmp: fs super block info + * @old: old vnode_cache + */ +void +chfs_vnode_cache_remove(struct chfs_mount *chmp, + struct chfs_vnode_cache* old) +{ + struct chfs_vnode_cache** prev; + + KASSERT(mutex_owned(&chmp->chm_lock_vnocache)); + + prev = &chmp->chm_vnocache_hash[old->vno % VNODECACHE_SIZE]; + while ((*prev) && (*prev)->vno < old->vno) { + prev = &(*prev)->next; + } + + if ((*prev) == old) { + *prev = old->next; + } + + if (old->state != VNO_STATE_READING && + old->state != VNO_STATE_CLEARING) { + chfs_vnode_cache_free(old); + } +} + +/** + * chfs_free_vnode_caches - free the vnocache_hash + * @chmp: fs super block info + */ +void +chfs_vnocache_hash_destroy(struct chfs_vnode_cache **hash) +{ + struct chfs_vnode_cache *this, *next; + int i; + + for (i = 0; i < VNODECACHE_SIZE; i++) { + this = hash[i]; + while (this) { + next = this->next; + chfs_vnode_cache_free(this); + this = next; + } + hash[i] = NULL; + } +} + + diff --git a/sys/ufs/chfs/chfs_vnops.c b/sys/ufs/chfs/chfs_vnops.c new file mode 100644 index 000000000..f6a11d93b --- /dev/null +++ b/sys/ufs/chfs/chfs_vnops.c @@ -0,0 +1,1765 @@ +/* $NetBSD: chfs_vnops.c,v 1.2 2011/11/24 21:09:37 agc Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, 
Hungary + * Copyright (C) 2010 Tamas Toth + * Copyright (C) 2010 Adam Hoka + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "chfs.h" + +#define READ_S "chfs_read" + +int +chfs_lookup(void *v) +{ + struct vnode *dvp = ((struct vop_lookup_args *) v)->a_dvp; + struct vnode **vpp = ((struct vop_lookup_args *) v)->a_vpp; + struct componentname *cnp = ((struct vop_lookup_args *) v)->a_cnp; + + int error; + struct chfs_inode* ip; + struct ufsmount* ump; + struct chfs_mount* chmp; + struct chfs_vnode_cache* chvc; + struct chfs_dirent* fd; + + dbg("lookup(): %s\n", cnp->cn_nameptr); + + KASSERT(VOP_ISLOCKED(dvp)); + + *vpp = NULL; + + // Check accessibility of requested node as a first step. + error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred); + if (error != 0) { + goto out; + } + + // If requesting the last path component on a read-only file system + // with a write operation, deny it. + if ((cnp->cn_flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) + && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { + error = EROFS; + goto out; + } + + // Avoid doing a linear scan of the directory if the requested + // directory/name couple is already in the cache. + error = cache_lookup(dvp, vpp, cnp); + if (error >= 0) { + goto out; + } + + ip = VTOI(dvp); + ump = VFSTOUFS(dvp->v_mount); + chmp = ump->um_chfs; + if (ip->ino == 0) { + ip->ino = ++chmp->chm_max_vno; + } + mutex_enter(&chmp->chm_lock_vnocache); + chvc = chfs_vnode_cache_get(chmp, ip->ino); + mutex_exit(&chmp->chm_lock_vnocache); + + // We cannot be requesting the parent directory of the root node. 
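+	// (For the root directory pvno == vno, so ISDOTDOT must not be set here.)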
+ KASSERT(IMPLIES(dvp->v_type == VDIR && chvc->pvno == chvc->vno, + !(cnp->cn_flags & ISDOTDOT))); + + if (cnp->cn_flags & ISDOTDOT) { + VOP_UNLOCK(dvp); + error = VFS_VGET(dvp->v_mount, ip->chvc->pvno, vpp); + vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); + } else if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { + vref(dvp); + *vpp = dvp; + error = 0; + } else { + fd = chfs_dir_lookup(ip, cnp); + + if (fd == NULL) { + dbg("fd null\n"); + // The entry was not found in the directory. + // This is OK if we are creating or renaming an + // entry and are working on the last component of + // the path name. + if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_nameiop == CREATE + || cnp->cn_nameiop == RENAME)) { + error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred); + if (error) { + dbg("after the entry was not found in dir\n"); + goto out; + } + + dbg("return EJUSTRETURN\n"); + error = EJUSTRETURN; + } else { + error = ENOENT; + } + } else { + // If we are not at the last path component and + // found a non-directory or non-link entry (which + // may itself be pointing to a directory), raise + // an error. + if ((fd->type != VDIR && fd->type != VLNK) && !(cnp->cn_flags + & ISLASTCN)) { + error = ENOTDIR; + goto out; + } + + dbg("vno@allocating new vnode: %llu\n", + (unsigned long long)fd->vno); + error = VFS_VGET(dvp->v_mount, fd->vno, vpp); + } + } + // Store the result of this lookup in the cache. Avoid this if the + // request was for creation, as it does not improve timings on + // emprical tests. + if ((cnp->cn_flags & MAKEENTRY) && cnp->cn_nameiop != CREATE + && (cnp->cn_flags & ISDOTDOT) == 0) + cache_enter(dvp, *vpp, cnp); + +out: + // If there were no errors, *vpp cannot be null and it must be + // locked. + KASSERT(IFF(error == 0, *vpp != NULL && VOP_ISLOCKED(*vpp))); + + // dvp must always be locked. 
+ KASSERT(VOP_ISLOCKED(dvp)); + + return error; +} + +/* --------------------------------------------------------------------- */ + +int +chfs_create(void *v) +{ + struct vop_create_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */*ap = v; + int error, mode; + dbg("create()\n"); + + mode = MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode); + + if ((mode & IFMT) == 0) { + if (ap->a_vap->va_type == VREG) + mode |= IFREG; + if (ap->a_vap->va_type == VSOCK) + mode |= IFSOCK; + } + + error = chfs_makeinode(mode, ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap->va_type); + + if (error) { + dbg("error: %d\n", error); + return error; + } + + VN_KNOTE(ap->a_dvp, NOTE_WRITE); + return 0; +} +/* --------------------------------------------------------------------- */ + +int +chfs_mknod(void *v) +{ + struct vnode *dvp = ((struct vop_mknod_args *) v)->a_dvp; + struct vnode **vpp = ((struct vop_mknod_args *) v)->a_vpp; + struct componentname *cnp = ((struct vop_mknod_args *) v)->a_cnp; + struct vattr *vap = ((struct vop_mknod_args *) v)->a_vap; + int mode, err = 0; + struct chfs_inode *ip; + struct vnode *vp; + + struct ufsmount *ump; + struct chfs_mount *chmp; + ino_t ino; + + struct chfs_full_dnode *fd; + struct buf *bp; + int len; + dbg("mknod()\n"); + + ump = VFSTOUFS(dvp->v_mount); + chmp = ump->um_chfs; + + if (vap->va_type != VBLK && vap->va_type != VCHR && vap->va_type != VFIFO) + return EINVAL; + + vp = *vpp; + + mode = MAKEIMODE(vap->va_type, vap->va_mode); + + if ((mode & IFMT) == 0) { + switch (vap->va_type) { + case VBLK: + mode |= IFBLK; + break; + case VCHR: + mode |= IFCHR; + break; + case VFIFO: + mode |= IFIFO; + break; + default: + break; + } + } + + err = chfs_makeinode(mode, dvp, &vp, cnp, vap->va_type); + + ip = VTOI(vp); + ino = ip->ino; + if (vap->va_rdev != VNOVAL) + ip->rdev = vap->va_rdev; + + if (vap->va_type == VFIFO) + vp->v_op = chfs_fifoop_p; + else { + vp->v_op = chfs_specop_p; + spec_node_init(vp, ip->rdev); + } + + if (err) + return err; + + len = sizeof(dev_t); + chfs_set_vnode_size(vp, len); + bp = getiobuf(vp, true); + bp->b_bufsize = bp->b_resid = len; + bp->b_data = kmem_alloc(len, KM_SLEEP); + memcpy(bp->b_data, &ip->rdev, len); + bp->b_blkno = 0; + + fd = chfs_alloc_full_dnode(); + + mutex_enter(&chmp->chm_lock_mountfields); + + err = chfs_write_flash_dnode(chmp, vp, bp, fd); + if (err) { + mutex_exit(&chmp->chm_lock_mountfields); + kmem_free(bp->b_data, len); + return err; + } + + err = chfs_add_full_dnode_to_inode(chmp, ip, fd); + if (err) { + mutex_exit(&chmp->chm_lock_mountfields); + kmem_free(bp->b_data, len); + return err; + } + + mutex_exit(&chmp->chm_lock_mountfields); + + *vpp = vp; + kmem_free(bp->b_data, len); + putiobuf(bp); + + return 0; +} + +/* --------------------------------------------------------------------- */ + +int +chfs_open(void *v) +{ + struct vnode *vp = ((struct vop_open_args *) v)->a_vp; + int mode = ((struct vop_open_args *) v)->a_mode; + dbg("open()\n"); + + int error; + struct chfs_inode *ip; + + KASSERT(VOP_ISLOCKED(vp)); + + ip = VTOI(vp); + + KASSERT(vp->v_size == ip->size); + if (ip->chvc->nlink < 1) { + error = ENOENT; + goto out; + } + + // If the file is marked append-only, deny write requests. 
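+	// A plain FWRITE open fails with EPERM; opening with O_APPEND as well is
+	// still allowed, since appending cannot overwrite existing data.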
+ if (ip->flags & APPEND && (mode & (FWRITE | O_APPEND)) == FWRITE) + error = EPERM; + else + error = 0; + +out: + KASSERT(VOP_ISLOCKED(vp)); + return error; +} + +/* --------------------------------------------------------------------- */ + +int +chfs_close(void *v) +{ + struct vnode *vp = ((struct vop_close_args *) v)->a_vp; + dbg("close()\n"); + + struct chfs_inode *ip; + + KASSERT(VOP_ISLOCKED(vp)); + + ip = VTOI(vp); + + if (ip->chvc->nlink > 0) { + //ip->chvc->nlink = 0; + chfs_update(vp, NULL, NULL, UPDATE_CLOSE); + } + + return 0; +} + +/* --------------------------------------------------------------------- */ + +int +chfs_access(void *v) +{ + struct vnode *vp = ((struct vop_access_args *) v)->a_vp; + int mode = ((struct vop_access_args *) v)->a_mode; + kauth_cred_t cred = ((struct vop_access_args *) v)->a_cred; + + dbg("access()\n"); + struct chfs_inode *ip = VTOI(vp); + + if (mode & VWRITE) { + switch (vp->v_type) { + case VLNK: + case VDIR: + case VREG: + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + break; + case VBLK: + case VCHR: + case VSOCK: + case VFIFO: + break; + default: + break; + } + } + + if (mode & VWRITE && ip->flags & IMMUTABLE) + return (EPERM); + + return genfs_can_access(vp->v_type, ip->mode & ALLPERMS, + ip->uid, ip->gid, mode, cred); +} + +/* --------------------------------------------------------------------- */ + +int +chfs_getattr(void *v) +{ + struct vnode *vp = ((struct vop_getattr_args *) v)->a_vp; + struct vattr *vap = ((struct vop_getattr_args *) v)->a_vap; + + struct chfs_inode *ip = VTOI(vp); + dbg("getattr()\n"); + + KASSERT(vp->v_size == ip->size); + + vattr_null(vap); + CHFS_ITIMES(ip, NULL, NULL, NULL); + + vap->va_type = vp->v_type; + vap->va_mode = ip->mode & ALLPERMS; + vap->va_nlink = ip->chvc->nlink; + vap->va_uid = ip->uid; + vap->va_gid = ip->gid; + vap->va_fsid = ip->dev; + vap->va_fileid = ip->ino; + vap->va_size = ip->size; + vap->va_blocksize = PAGE_SIZE; + vap->va_atime.tv_sec = ip->atime; + vap->va_atime.tv_nsec = 0; + vap->va_mtime.tv_sec = ip->mtime; + vap->va_mtime.tv_nsec = 0; + vap->va_ctime.tv_sec = ip->ctime; + vap->va_ctime.tv_nsec = 0; + vap->va_gen = ip->version; + vap->va_flags = ip->flags; + vap->va_rdev = ip->rdev; + vap->va_bytes = round_page(ip->size); + vap->va_filerev = VNOVAL; + vap->va_vaflags = 0; + vap->va_spare = VNOVAL; + + return 0; +} + +/* --------------------------------------------------------------------- */ + +/* Note: modelled after tmpfs's same function */ + +int +chfs_setattr(void *v) +{ + struct vnode *vp = ((struct vop_setattr_args *) v)->a_vp; + struct vattr *vap = ((struct vop_setattr_args *) v)->a_vap; + kauth_cred_t cred = ((struct vop_setattr_args *) v)->a_cred; + + struct chfs_inode *ip; + struct ufsmount *ump = VFSTOUFS(vp->v_mount); + struct chfs_mount *chmp = ump->um_chfs; + int error = 0; + + dbg("setattr()\n"); + + KASSERT(VOP_ISLOCKED(vp)); + ip = VTOI(vp); + + /* Abort if any unsettable attribute is given. 
*/ + if (vap->va_type != VNON || vap->va_nlink != VNOVAL || + vap->va_fsid != VNOVAL || vap->va_fileid != VNOVAL || + vap->va_blocksize != VNOVAL /*|| GOODTIME(&vap->va_ctime)*/ || + vap->va_gen != VNOVAL || vap->va_rdev != VNOVAL || + vap->va_bytes != VNOVAL) { + return EINVAL; + } + + if (error == 0 && (vap->va_flags != VNOVAL)) + error = chfs_chflags(vp, vap->va_flags, cred); + + if (error == 0 && (vap->va_size != VNOVAL)) + error = chfs_chsize(vp, vap->va_size, cred); + + if (error == 0 && (vap->va_uid != VNOVAL || vap->va_gid != VNOVAL)) + error = chfs_chown(vp, vap->va_uid, vap->va_gid, cred); + + if (error == 0 && (vap->va_mode != VNOVAL)) + error = chfs_chmod(vp, vap->va_mode, cred); + +#if 0 + /* why do we need that? */ + if (ip->flags & (IMMUTABLE | APPEND)) + return EPERM; +#endif + + if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { + error = genfs_can_chtimes(vp, vap->va_vaflags, ip->uid, cred); + if (error) + return error; + if (vap->va_atime.tv_sec != VNOVAL) + ip->iflag |= IN_ACCESS; + if (vap->va_mtime.tv_sec != VNOVAL) + ip->iflag |= IN_CHANGE | IN_UPDATE; + error = chfs_update(vp, + &vap->va_atime, &vap->va_mtime, UPDATE_WAIT); + if (error) + return error; + } + + mutex_enter(&chmp->chm_lock_mountfields); + error = chfs_write_flash_vnode(chmp, ip, ALLOC_NORMAL); + mutex_exit(&chmp->chm_lock_mountfields); + + return error; +} + +int +chfs_chmod(struct vnode *vp, int mode, kauth_cred_t cred) +{ + struct chfs_inode *ip = VTOI(vp); + int error; + dbg("chmod\n"); + + error = genfs_can_chmod(vp, cred, ip->uid, ip->gid, mode); + if (error) + return error; + ip->mode &= ~ALLPERMS; + ip->mode |= (mode & ALLPERMS); + ip->iflag |= IN_CHANGE; + + error = chfs_update(vp, NULL, NULL, UPDATE_WAIT); + if (error) + return error; + + return 0; +} + +int +chfs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred) +{ + struct chfs_inode *ip = VTOI(vp); + int error; + dbg("chown\n"); + + if (uid == (uid_t)VNOVAL) + uid = ip->uid; + if (gid == (gid_t)VNOVAL) + gid = ip->gid; + + error = genfs_can_chown(vp, cred, ip->uid, ip->gid, uid, gid); + if (error) + return error; + + ip->gid = gid; + ip->uid = uid; + ip->iflag |= IN_CHANGE; + + error = chfs_update(vp, NULL, NULL, UPDATE_WAIT); + if (error) + return error; + + return 0; +} + + +/* --------------------------------------------------------------------- */ +/* calculates ((off_t)blk * chmp->chm_chm_fs_bsize) */ +#define lblktosize(chmp, blk) \ + (((off_t)(blk)) << (chmp)->chm_fs_bshift) + +/* calculates (loc % chmp->chm_chm_fs_bsize) */ +#define blkoff(chmp, loc) \ + ((loc) & (chmp)->chm_fs_qbmask) + +/* calculates (loc / chmp->chm_chm_fs_bsize) */ +#define lblkno(chmp, loc) \ + ((loc) >> (chmp)->chm_fs_bshift) + +/* calculates roundup(size, chmp->chm_chm_fs_fsize) */ +#define fragroundup(chmp, size) \ + (((size) + (chmp)->chm_fs_qfmask) & (chmp)->chm_fs_fmask) + +#define blksize(chmp, ip, lbn) \ + (((lbn) >= NDADDR || (ip)->size >= lblktosize(chmp, (lbn) + 1)) \ + ? 
(chmp)->chm_fs_bsize \ + : (fragroundup(chmp, blkoff(chmp, (ip)->size)))) + +/* calculates roundup(size, chmp->chm_chm_fs_bsize) */ +#define blkroundup(chmp, size) \ + (((size) + (chmp)->chm_fs_qbmask) & (chmp)->chm_fs_bmask) + +int +chfs_read(void *v) +{ + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp; + struct chfs_inode *ip; + struct uio *uio; + struct ufsmount *ump; + struct buf *bp; + struct chfs_mount *chmp; + daddr_t lbn, nextlbn; + off_t bytesinfile; + long size, xfersize, blkoffset; + int error, ioflag; + vsize_t bytelen; + bool usepc = false; + + dbg("chfs_read\n"); + + vp = ap->a_vp; + ip = VTOI(vp); + ump = ip->ump; + uio = ap->a_uio; + ioflag = ap->a_ioflag; + error = 0; + + dbg("ip->size:%llu\n", (unsigned long long)ip->size); + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_READ) + panic("%s: mode", READ_S); + + if (vp->v_type == VLNK) { + if (ip->size < ump->um_maxsymlinklen) + panic("%s: short symlink", READ_S); + } else if (vp->v_type != VREG && vp->v_type != VDIR) + panic("%s: type %d", READ_S, vp->v_type); +#endif + chmp = ip->chmp; + if ((u_int64_t)uio->uio_offset > ump->um_maxfilesize) + return (EFBIG); + if (uio->uio_resid == 0) + return (0); + + fstrans_start(vp->v_mount, FSTRANS_SHARED); + + if (uio->uio_offset >= ip->size) + goto out; + + usepc = vp->v_type == VREG; + bytelen = 0; + if (usepc) { + const int advice = IO_ADV_DECODE(ap->a_ioflag); + + while (uio->uio_resid > 0) { + if (ioflag & IO_DIRECT) { + genfs_directio(vp, uio, ioflag); + } + bytelen = MIN(ip->size - uio->uio_offset, + uio->uio_resid); + if (bytelen == 0) + break; + error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice, + UBC_READ | UBC_PARTIALOK | + (UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0)); + if (error) + break; + + } + goto out; + } + + + dbg("start reading\n"); + for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { + bytesinfile = ip->size - uio->uio_offset; + if (bytesinfile <= 0) + break; + lbn = lblkno(chmp, uio->uio_offset); + nextlbn = lbn + 1; + size = blksize(chmp, ip, lbn); + blkoffset = blkoff(chmp, uio->uio_offset); + xfersize = MIN(MIN(chmp->chm_fs_bsize - blkoffset, uio->uio_resid), + bytesinfile); + + if (lblktosize(chmp, nextlbn) >= ip->size) { + error = bread(vp, lbn, size, NOCRED, 0, &bp); + dbg("after bread\n"); + } else { + int nextsize = blksize(chmp, ip, nextlbn); + dbg("size: %ld\n", size); + error = breadn(vp, lbn, + size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp); + dbg("after breadN\n"); + } + if (error) + break; + + /* + * We should only get non-zero b_resid when an I/O error + * has occurred, which should cause us to break above. + * However, if the short read did not cause an error, + * then we want to ensure that we do not uiomove bad + * or uninitialized data. 
+ */ + size -= bp->b_resid; + if (size < xfersize) { + if (size == 0) + break; + xfersize = size; + } + dbg("uiomove\n"); + error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); + if (error) + break; + brelse(bp, 0); + } + if (bp != NULL) + brelse(bp, 0); + +out: + if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) { + ip->iflag |= IN_ACCESS; + if ((ap->a_ioflag & IO_SYNC) == IO_SYNC) { + //error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) { + fstrans_done(vp->v_mount); + return error; + } + error = chfs_update(vp, NULL, NULL, UPDATE_WAIT); + //UFS_WAPBL_END(vp->v_mount); + } + } + + dbg("[END]\n"); + fstrans_done(vp->v_mount); + return (error); +} + + +/* --------------------------------------------------------------------- */ + +/*from ffs write*/ +int +chfs_write(void *v) +{ + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp ; + struct uio *uio; + struct chfs_inode *ip; + struct chfs_mount *chmp; + struct lwp *l; + kauth_cred_t cred; + off_t osize, origoff, oldoff, preallocoff, endallocoff, nsize; + int blkoffset, error, flags, ioflag, resid; + int aflag; + int extended=0; + vsize_t bytelen; + bool async; + struct ufsmount *ump; + + + cred = ap->a_cred; + ioflag = ap->a_ioflag; + uio = ap->a_uio; + vp = ap->a_vp; + ip = VTOI(vp); + //dbg("file size (vp): %llu\n", (unsigned long long)vp->v_size); + //dbg("file size (ip): %llu\n", (unsigned long long)ip->i_size); + ump = ip->ump; + + //dbg("uio->resid: %d\n", uio->uio_resid); + dbg("write\n"); + + KASSERT(vp->v_size == ip->size); + + switch (vp->v_type) { + case VREG: + if (ioflag & IO_APPEND) + uio->uio_offset = ip->size; + if ((ip->flags & APPEND) && uio->uio_offset != ip->size) + return (EPERM); + /* FALLTHROUGH */ + case VLNK: + break; + case VDIR: + if ((ioflag & IO_SYNC) == 0) + panic("chfs_write: nonsync dir write"); + break; + default: + panic("chfs_write: type"); + } + + chmp = ip->chmp; + if (uio->uio_offset < 0 || + (u_int64_t)uio->uio_offset + + uio->uio_resid > ump->um_maxfilesize) { + dbg("uio->uio_offset = %lld | uio->uio_offset + " + "uio->uio_resid (%llu) > ump->um_maxfilesize (%lld)\n", + (long long)uio->uio_offset, + (uint64_t)uio->uio_offset + uio->uio_resid, + (long long)ump->um_maxfilesize); + return (EFBIG); + } + /* + * Maybe this should be above the vnode op call, but so long as + * file servers have no limits, I don't think it matters. + */ + l = curlwp; + if (vp->v_type == VREG && l && + uio->uio_offset + uio->uio_resid > + l->l_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) { + mutex_enter(proc_lock); + psignal(l->l_proc, SIGXFSZ); + mutex_exit(proc_lock); + return (EFBIG); + } + if (uio->uio_resid == 0) + return (0); + + //mutex_enter(&ip->inode_lock); + fstrans_start(vp->v_mount, FSTRANS_SHARED); + + flags = ioflag & IO_SYNC ? B_SYNC : 0; + async = vp->v_mount->mnt_flag & MNT_ASYNC; + origoff = uio->uio_offset; + resid = uio->uio_resid; + osize = ip->size; + error = 0; + + + /*if ((ioflag & IO_JOURNALLOCKED) == 0) { + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) { + fstrans_done(vp->v_mount); + return error; + } + }*/ + + preallocoff = round_page(blkroundup(chmp, + MAX(osize, uio->uio_offset))); + aflag = ioflag & IO_SYNC ? B_SYNC : 0; + nsize = MAX(osize, uio->uio_offset + uio->uio_resid); + endallocoff = nsize - blkoff(chmp, nsize); + + /* + * if we're increasing the file size, deal with expanding + * the fragment if there is one. 
+ */ + + if (nsize > osize && lblkno(chmp, osize) < NDADDR && + lblkno(chmp, osize) != lblkno(chmp, nsize) && + blkroundup(chmp, osize) != osize) { + off_t eob; + + eob = blkroundup(chmp, osize); + uvm_vnp_setwritesize(vp, eob); + error = ufs_balloc_range(vp, osize, eob - osize, cred, aflag); + if (error) + goto out; + if (flags & B_SYNC) { + mutex_enter(vp->v_interlock); + VOP_PUTPAGES(vp, + trunc_page(osize & chmp->chm_fs_bmask), + round_page(eob), + PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED); + } + } + + while (uio->uio_resid > 0) { + int ubc_flags = UBC_WRITE; + bool overwrite; /* if we're overwrite a whole block */ + off_t newoff; + + if (ioflag & IO_DIRECT) { + genfs_directio(vp, uio, ioflag | IO_JOURNALLOCKED); + } + + oldoff = uio->uio_offset; + blkoffset = blkoff(chmp, uio->uio_offset); + bytelen = MIN(chmp->chm_fs_bsize - blkoffset, uio->uio_resid); + if (bytelen == 0) { + break; + } + + /* + * if we're filling in a hole, allocate the blocks now and + * initialize the pages first. if we're extending the file, + * we can safely allocate blocks without initializing pages + * since the new blocks will be inaccessible until the write + * is complete. + */ + overwrite = uio->uio_offset >= preallocoff && + uio->uio_offset < endallocoff; + if (!overwrite && (vp->v_vflag & VV_MAPPED) == 0 && + blkoff(chmp, uio->uio_offset) == 0 && + (uio->uio_offset & PAGE_MASK) == 0) { + vsize_t len; + + len = trunc_page(bytelen); + len -= blkoff(chmp, len); + if (len > 0) { + overwrite = true; + bytelen = len; + } + } + + newoff = oldoff + bytelen; + if (vp->v_size < newoff) { + uvm_vnp_setwritesize(vp, newoff); + } + + if (!overwrite) { + error = ufs_balloc_range(vp, uio->uio_offset, bytelen, + cred, aflag); + if (error) + break; + } else { + genfs_node_wrlock(vp); + error = GOP_ALLOC(vp, uio->uio_offset, bytelen, + aflag, cred); + genfs_node_unlock(vp); + if (error) + break; + ubc_flags |= UBC_FAULTBUSY; + } + + /* + * copy the data. + */ + + ubc_flags |= UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0; + error = ubc_uiomove(&vp->v_uobj, uio, bytelen, + IO_ADV_DECODE(ioflag), ubc_flags); + + /* + * update UVM's notion of the size now that we've + * copied the data into the vnode's pages. + * + * we should update the size even when uiomove failed. + */ + + if (vp->v_size < newoff) { + uvm_vnp_setsize(vp, newoff); + extended = 1; + } + + if (error) + break; + + /* + * flush what we just wrote if necessary. + * XXXUBC simplistic async flushing. + */ + + if (!async && oldoff >> 16 != uio->uio_offset >> 16) { + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, (oldoff >> 16) << 16, + (uio->uio_offset >> 16) << 16, + PGO_CLEANIT | PGO_JOURNALLOCKED); + if (error) + break; + } + } +out: + if (error == 0 && ioflag & IO_SYNC) { + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, + trunc_page(origoff & chmp->chm_fs_bmask), + round_page(blkroundup(chmp, uio->uio_offset)), + PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED); + } + ip->iflag |= IN_CHANGE | IN_UPDATE; + if (resid > uio->uio_resid && ap->a_cred && + kauth_authorize_generic(ap->a_cred, KAUTH_GENERIC_ISSUSER, NULL)) { + ip->mode &= ~(ISUID | ISGID); + } + if (resid > uio->uio_resid) + VN_KNOTE(vp, NOTE_WRITE | (extended ? 
NOTE_EXTEND : 0)); + if (error) { + (void) UFS_TRUNCATE(vp, osize, ioflag & IO_SYNC, ap->a_cred); + uio->uio_offset -= resid - uio->uio_resid; + uio->uio_resid = resid; + } else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC) + error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT); + + //XXX hack, i write the next line after i know ip->i_size and vp->v_size don't equal + chfs_set_vnode_size(vp, vp->v_size); + + + //dbg("end file size (vp): %llu\n", (unsigned long long)vp->v_size); + //dbg("end file size (ip): %llu\n", (unsigned long long)ip->i_size); + KASSERT(vp->v_size == ip->size); + fstrans_done(vp->v_mount); + + mutex_enter(&chmp->chm_lock_mountfields); + error = chfs_write_flash_vnode(chmp, ip, ALLOC_NORMAL); + mutex_exit(&chmp->chm_lock_mountfields); + + //mutex_exit(&ip->inode_lock); + //dbg("end\n"); + return (error); +} + + +/* --------------------------------------------------------------------- */ + +int +chfs_fsync(void *v) +{ + //dbg("fsync\n"); + struct vop_fsync_args /* { + struct vnode *a_vp; + kauth_cred_t a_cred; + int a_flags; + off_t offlo; + off_t offhi; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + int wait; + + if (ap->a_flags & FSYNC_CACHE) { + return ENODEV; + } + wait = (ap->a_flags & FSYNC_WAIT) != 0; + vflushbuf(vp, wait); + //struct chfs_inode *ip = VTOI(vp); + //chfs_set_vnode_size(vp, ip->write_size); + + return 0; +} + +/* --------------------------------------------------------------------- */ + +int +chfs_remove(void *v) +{ + struct vnode *dvp = ((struct vop_remove_args *) v)->a_dvp; + struct vnode *vp = ((struct vop_remove_args *) v)->a_vp; + struct componentname *cnp = (((struct vop_remove_args *) v)->a_cnp); + dbg("remove\n"); + + KASSERT(VOP_ISLOCKED(dvp)); + KASSERT(VOP_ISLOCKED(vp)); + + struct chfs_inode *ip = VTOI(vp); + struct chfs_inode *parent = VTOI(dvp); + int error = 0; + + KASSERT(ip->chvc->vno != ip->chvc->pvno); + + error = chfs_do_unlink(ip, + parent, cnp->cn_nameptr, cnp->cn_namelen); + + vput(dvp); + vput(vp); + + return error; +} + +/* --------------------------------------------------------------------- */ + +int +chfs_link(void *v) +{ + struct vnode *dvp = ((struct vop_link_args *) v)->a_dvp; + struct vnode *vp = ((struct vop_link_args *) v)->a_vp; + struct componentname *cnp = ((struct vop_link_args *) v)->a_cnp; + + struct chfs_inode *ip, *parent; + int error = 0; + + if (vp->v_type == VDIR) { + VOP_ABORTOP(dvp, cnp); + error = EISDIR; + goto out; + } + if (dvp->v_mount != vp->v_mount) { + VOP_ABORTOP(dvp, cnp); + error = EXDEV; + goto out; + } + if (dvp != vp && (error = vn_lock(vp, LK_EXCLUSIVE))) { + VOP_ABORTOP(dvp, cnp); + goto out; + } + + parent = VTOI(dvp); + ip = VTOI(vp); + + error = chfs_do_link(ip, + parent, cnp->cn_nameptr, cnp->cn_namelen, vp->v_type); + + if (dvp != vp) + VOP_UNLOCK(vp); +out: + vput(dvp); + return error; +} + +/* --------------------------------------------------------------------- */ + +int +chfs_rename(void *v) +{ + struct vnode *fdvp = ((struct vop_rename_args *) v)->a_fdvp; + struct vnode *fvp = ((struct vop_rename_args *) v)->a_fvp; + struct componentname *fcnp = ((struct vop_rename_args *) v)->a_fcnp; + struct vnode *tdvp = ((struct vop_rename_args *) v)->a_tdvp; + struct vnode *tvp = ((struct vop_rename_args *) v)->a_tvp; + struct componentname *tcnp = ((struct vop_rename_args *) v)->a_tcnp; + + struct chfs_inode *oldparent, *old; + struct chfs_inode *newparent; + struct chfs_dirent *fd;//, *oldfd; + struct chfs_inode *ip; + int error = 0; + dbg("rename\n"); + + 
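+	/*
+	 * Rename is carried out as a link of the source inode under the new
+	 * name followed by an unlink of the old name.  The target vnodes
+	 * arrive locked while the source vnodes are only referenced, hence
+	 * the vput()/vrele() pairs at the end of this function.
+	 */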
KASSERT(VOP_ISLOCKED(tdvp)); + KASSERT(IMPLIES(tvp != NULL, VOP_ISLOCKED(tvp) == LK_EXCLUSIVE)); + + oldparent = VTOI(fdvp); + old = VTOI(fvp); + newparent = VTOI(tdvp); + if (tvp) { + dbg("tvp not null\n"); + ip = VTOI(tvp); + if (tvp->v_type == VDIR) { + //TODO: lock +// fd = ip->dents; +// while (fd) { + TAILQ_FOREACH(fd, &ip->dents, fds) { + if (fd->vno) { + //TODO: unlock + error = ENOTEMPTY; + goto out_unlocked; + } +// fd = fd->next; + } + //TODO: unlock + } + error = chfs_do_unlink(ip, + newparent, tcnp->cn_nameptr, tcnp->cn_namelen); + vput(tvp); + } + VFS_VGET(tdvp->v_mount, old->ino, &tvp); + ip = VTOI(tvp); + +// for (oldfd = oldparent->dents; +// oldfd->vno != old->ino; +// oldfd = oldfd->next); + + error = chfs_do_link(ip, + newparent, tcnp->cn_nameptr, tcnp->cn_namelen, tvp->v_type); + error = chfs_do_unlink(old, + oldparent, fcnp->cn_nameptr, fcnp->cn_namelen); + +//out: +// if (fchnode != tchnode) +// VOP_UNLOCK(fdvp, 0); + +out_unlocked: + // Release target nodes. + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp != NULL) + vput(tvp); + + // Release source nodes. + vrele(fdvp); + vrele(fvp); + + return error; +} + +/* --------------------------------------------------------------------- */ + +int +chfs_mkdir(void *v) +{ + struct vnode *dvp = ((struct vop_mkdir_args *) v)->a_dvp; + struct vnode **vpp = ((struct vop_mkdir_args *)v)->a_vpp; + struct componentname *cnp = ((struct vop_mkdir_args *) v)->a_cnp; + struct vattr *vap = ((struct vop_mkdir_args *) v)->a_vap; + dbg("mkdir()\n"); + + int mode; + + mode = vap->va_mode & ACCESSPERMS; + if ((mode & IFMT) == 0) { + mode |= IFDIR; + } + + KASSERT(vap->va_type == VDIR); + + return chfs_makeinode(mode, dvp, vpp, cnp, VDIR); +} + +/* --------------------------------------------------------------------- */ + +int +chfs_rmdir(void *v) +{ + struct vnode *dvp = ((struct vop_rmdir_args *) v)->a_dvp; + struct vnode *vp = ((struct vop_rmdir_args *) v)->a_vp; + struct componentname *cnp = ((struct vop_rmdir_args *) v)->a_cnp; + dbg("rmdir()\n"); + + KASSERT(VOP_ISLOCKED(dvp)); + KASSERT(VOP_ISLOCKED(vp)); + + struct chfs_inode *ip = VTOI(vp); + struct chfs_inode *parent = VTOI(dvp); + struct chfs_dirent *fd; + int error = 0; + + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + + KASSERT(ip->chvc->vno != ip->chvc->pvno); + +// for (fd = ip->dents; fd; fd = fd->next) { + TAILQ_FOREACH(fd, &ip->dents, fds) { + if (fd->vno) { + error = ENOTEMPTY; + goto out; + } + } + + error = chfs_do_unlink(ip, + parent, cnp->cn_nameptr, cnp->cn_namelen); + +out: + vput(dvp); + vput(vp); + + return error; +} + +/* --------------------------------------------------------------------- */ + +int +chfs_symlink(void *v) +{ + struct vnode *dvp = ((struct vop_symlink_args *) v)->a_dvp; + struct vnode **vpp = ((struct vop_symlink_args *) v)->a_vpp; + struct componentname *cnp = ((struct vop_symlink_args *) v)->a_cnp; + struct vattr *vap = ((struct vop_symlink_args *) v)->a_vap; + char *target = ((struct vop_symlink_args *) v)->a_target; + + struct ufsmount *ump; + struct chfs_mount *chmp; + struct vnode *vp; + struct chfs_inode *ip; + int len, err; + struct chfs_full_dnode *fd; + struct buf *bp; + dbg("symlink()\n"); + + ump = VFSTOUFS(dvp->v_mount); + chmp = ump->um_chfs; + + err = chfs_makeinode(IFLNK | vap->va_mode, dvp, vpp, cnp, VLNK); + if (err) + return (err); + VN_KNOTE(dvp, NOTE_WRITE); + vp = *vpp; + len = strlen(target); + ip = VTOI(vp); + /* TODO max symlink len instead of "100" */ + if (len < 100) { + ip->target = 
kmem_alloc(len, KM_SLEEP); + memcpy(ip->target, target, len); + chfs_set_vnode_size(vp, len); + ip->iflag |= IN_CHANGE | IN_UPDATE; + + bp = getiobuf(vp, true); + bp->b_bufsize = bp->b_resid = len; + bp->b_data = kmem_alloc(len, KM_SLEEP); + memcpy(bp->b_data, target, len); + bp->b_blkno = 0; + + fd = chfs_alloc_full_dnode(); + + mutex_enter(&chmp->chm_lock_mountfields); + + err = chfs_write_flash_dnode(chmp, vp, bp, fd); + if (err) { + mutex_exit(&chmp->chm_lock_mountfields); + goto out; + } + + err = chfs_add_full_dnode_to_inode(chmp, ip, fd); + if (err) { + mutex_exit(&chmp->chm_lock_mountfields); + goto out; + } + + mutex_exit(&chmp->chm_lock_mountfields); + + kmem_free(bp->b_data, len); + putiobuf(bp); + + uvm_vnp_setsize(vp, len); + } else { + err = vn_rdwr(UIO_WRITE, vp, target, len, (off_t)0, + UIO_SYSSPACE, IO_NODELOCKED, cnp->cn_cred, + (size_t *)0, NULL); + } + +out: + if (err) + vput(vp); + + return (err); +} + +/* --------------------------------------------------------------------- */ + +int +chfs_readdir(void *v) +{ + struct vnode *vp = ((struct vop_readdir_args *) v)->a_vp; + struct uio *uio = ((struct vop_readdir_args *) v)->a_uio; + int *eofflag = ((struct vop_readdir_args *) v)->a_eofflag; + + int error = 0; + off_t skip, offset; + struct chfs_inode *ip; + struct chfs_dirent *fd; + + struct ufsmount *ump; + struct chfs_mount *chmp; + struct chfs_vnode_cache *chvc; + + KASSERT(VOP_ISLOCKED(vp)); + + /* This operation only makes sense on directory nodes. */ + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + + ip = VTOI(vp); + + /* uiomove in chfs_filldir automatically increments the + * uio_offset by an arbitrary size, so we discard any change + * to uio_offset and set it to our own value on return + */ + offset = uio->uio_offset; + + if (offset == CHFS_OFFSET_DOT) { + error = chfs_filldir(uio, ip->ino, ".", 1, VDIR); + if (error == -1) { + error = 0; + goto outok; + } else if (error != 0) + goto outok; + + offset = CHFS_OFFSET_DOTDOT; + } + + if (offset == CHFS_OFFSET_DOTDOT) { + ump = VFSTOUFS(vp->v_mount); + chmp = ump->um_chfs; + mutex_enter(&chmp->chm_lock_vnocache); + chvc = chfs_vnode_cache_get(chmp, ip->ino); + mutex_exit(&chmp->chm_lock_vnocache); + + error = chfs_filldir(uio, chvc->pvno, "..", 2, VDIR); + if (error == -1) { + error = 0; + goto outok; + } else if (error != 0) { + goto outok; + } + + if (TAILQ_EMPTY(&ip->dents)) { + offset = CHFS_OFFSET_EOF; + } else { + offset = CHFS_OFFSET_FIRST; + } + } + + if (offset != CHFS_OFFSET_EOF) { + skip = offset - CHFS_OFFSET_FIRST; + + TAILQ_FOREACH(fd, &ip->dents, fds) { + /* seek to offset by skipping items */ + /* XXX race conditions by changed dirent? 
*/ + if (skip > 0) { + skip--; + continue; + } + + if (fd->vno != 0) { + error = chfs_filldir(uio, fd->vno, + fd->name, fd->nsize, fd->type); + if (error == -1) { + error = 0; + goto outok; + } else if (error != 0) { + dbg("err %d\n", error); + goto outok; + } + } + offset++; + } + } + offset = CHFS_OFFSET_EOF; + +outok: + uio->uio_offset = offset; + + if (eofflag != NULL) { + *eofflag = (error == 0 && + uio->uio_offset == CHFS_OFFSET_EOF); + } + +out: + KASSERT(VOP_ISLOCKED(vp)); + + return error; +} + +/* --------------------------------------------------------------------- */ + +int +chfs_readlink(void *v) +{ + + struct vnode *vp = ((struct vop_readlink_args *) v)->a_vp; + struct uio *uio = ((struct vop_readlink_args *) v)->a_uio; + kauth_cred_t cred = ((struct vop_readlink_args *) v)->a_cred; + + struct chfs_inode *ip = VTOI(vp); + + dbg("readlink()\n"); + + /* TODO max symlink len instead of "100" */ + if (ip->size < 100) { + uiomove(ip->target, ip->size, uio); + return (0); + } + + return (VOP_READ(vp, uio, 0, cred)); +} + +/* --------------------------------------------------------------------- */ + +int +chfs_inactive(void *v) +{ + struct vnode *vp = ((struct vop_inactive_args *) v)->a_vp; + struct chfs_inode *ip = VTOI(vp); + struct chfs_vnode_cache *chvc; + dbg("inactive | vno: %llu\n", (unsigned long long)ip->ino); + + KASSERT(VOP_ISLOCKED(vp)); + + if (ip->ino) { + chvc = ip->chvc; + if (chvc->nlink) + *((struct vop_inactive_args *) v)->a_recycle = 0; + } else { + *((struct vop_inactive_args *) v)->a_recycle = 1; + } + + VOP_UNLOCK(vp); + + return 0; +} + +/* --------------------------------------------------------------------- */ + +int +chfs_reclaim(void *v) +{ + struct vop_reclaim_args *ap = v; + struct vnode *vp = ap->a_vp; + struct chfs_inode *ip = VTOI(vp); + struct chfs_mount *chmp = ip->chmp; + struct chfs_dirent *fd; + + //dbg("reclaim() | ino: %llu\n", (unsigned long long)ip->ino); + //mutex_enter(&ip->inode_lock); + + mutex_enter(&chmp->chm_lock_vnocache); + chfs_vnode_cache_set_state(chmp, + ip->chvc, VNO_STATE_CHECKEDABSENT); + mutex_exit(&chmp->chm_lock_vnocache); + + chfs_update(vp, NULL, NULL, UPDATE_CLOSE); + + if (vp->v_type == VREG || vp->v_type == VLNK || vp->v_type == VCHR || + vp->v_type == VBLK || vp->v_type == VFIFO || vp->v_type == VSOCK) + chfs_kill_fragtree(&ip->fragtree); + + fd = TAILQ_FIRST(&ip->dents); + while(fd) { + TAILQ_REMOVE(&ip->dents, fd, fds); + chfs_free_dirent(fd); + fd = TAILQ_FIRST(&ip->dents); + } + //mutex_exit(&ip->inode_lock); + //mutex_destroy(&ip->inode_lock); + + cache_purge(vp); + if (ip->devvp) { + vrele(ip->devvp); + ip->devvp = 0; + } + chfs_ihashrem(ip); + + genfs_node_destroy(vp); + pool_put(&chfs_inode_pool, vp->v_data); + vp->v_data = NULL; + return (0); +} + +/* --------------------------------------------------------------------- */ + +int +chfs_advlock(void *v) +{ + //struct vnode *vp = ((struct vop_advlock_args *) v)->a_vp; + dbg("advlock()\n"); + /* + struct chfs_node *node; + + node = VP_TO_CHFS_NODE(vp); + + return lf_advlock(v, &node->chn_lockf, node->chn_size); + */ + return 0; +} + +/* --------------------------------------------------------------------- */ +int +chfs_strategy(void *v) +{ + struct vop_strategy_args /* { + const struct vnodeop_desc *a_desc; + struct vnode *a_vp; + struct buf *a_bp; + } */ *ap = v; + struct chfs_full_dnode *fd; + struct buf *bp = ap->a_bp; + struct vnode *vp = ap->a_vp; + struct chfs_inode *ip = VTOI(vp); + struct chfs_mount *chmp = ip->chmp; + int read = (bp->b_flags & B_READ) 
? 1 : 0; + int err = 0; + +/* dbg("bp dump:\n"); + dbg(" ->b_bcount: %d\n", bp->b_bcount); + dbg(" ->b_resid: %d\n", bp->b_resid); + dbg(" ->b_blkno: %llu\n", (unsigned long long)bp->b_blkno); + dbg(" ->b_error: %d\n", bp->b_error);*/ + if (read) { + err = chfs_read_data(chmp, vp, bp); + } else { + fd = chfs_alloc_full_dnode(); + + mutex_enter(&chmp->chm_lock_mountfields); + + err = chfs_write_flash_dnode(chmp, vp, bp, fd); + if (err) { + mutex_exit(&chmp->chm_lock_mountfields); + goto out; + } + + err = chfs_add_full_dnode_to_inode(chmp, ip, fd); + /*if (err) { + mutex_exit(&chmp->chm_lock_mountfields); + goto out; + }*/ + + mutex_exit(&chmp->chm_lock_mountfields); + } +out: + biodone(bp); + //dbg("end\n"); + return err; +} + +int +chfs_bmap(void *v) +{ + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct vnode **a_vpp; + daddr_t *a_bnp; + int *a_runp; + int *a_runb; + } */ *ap = v; + if (ap->a_vpp != NULL) + *ap->a_vpp = ap->a_vp; + if (ap->a_bnp != NULL) + *ap->a_bnp = ap->a_bn; + if (ap->a_runp != NULL) + *ap->a_runp = 0; + return (0); +} + +/* + * vnode operations vector used for files stored in a chfs file system. + */ +int +(**chfs_vnodeop_p)(void *); +const struct vnodeopv_entry_desc chfs_vnodeop_entries[] = + { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, chfs_lookup }, + { &vop_create_desc, chfs_create }, + { &vop_mknod_desc, chfs_mknod }, + { &vop_open_desc, chfs_open }, + { &vop_close_desc, chfs_close }, + { &vop_access_desc, chfs_access }, + { &vop_getattr_desc, chfs_getattr }, + { &vop_setattr_desc, chfs_setattr }, + { &vop_read_desc, chfs_read }, + { &vop_write_desc, chfs_write }, + { &vop_ioctl_desc, genfs_enoioctl }, + { &vop_fcntl_desc, genfs_fcntl }, + { &vop_poll_desc, genfs_poll }, + { &vop_kqfilter_desc, genfs_kqfilter }, + { &vop_revoke_desc, genfs_revoke }, + { &vop_mmap_desc, genfs_mmap }, + { &vop_fsync_desc, chfs_fsync }, + { &vop_seek_desc, genfs_seek }, + { &vop_remove_desc, chfs_remove }, + { &vop_link_desc, chfs_link }, + { &vop_rename_desc, chfs_rename }, + { &vop_mkdir_desc, chfs_mkdir }, + { &vop_rmdir_desc, chfs_rmdir }, + { &vop_symlink_desc, chfs_symlink }, + { &vop_readdir_desc, chfs_readdir }, + { &vop_readlink_desc, chfs_readlink }, + { &vop_abortop_desc, genfs_abortop }, + { &vop_inactive_desc, chfs_inactive }, + { &vop_reclaim_desc, chfs_reclaim }, + { &vop_lock_desc, genfs_lock }, + { &vop_unlock_desc, genfs_unlock }, + { &vop_bmap_desc, chfs_bmap }, + { &vop_strategy_desc, chfs_strategy }, + { &vop_print_desc, ufs_print }, + { &vop_pathconf_desc, ufs_pathconf }, + { &vop_islocked_desc, genfs_islocked }, + { &vop_advlock_desc, chfs_advlock }, + { &vop_bwrite_desc, vn_bwrite }, + { &vop_getpages_desc, genfs_getpages }, + { &vop_putpages_desc, genfs_putpages }, + { NULL, NULL } }; + +const struct vnodeopv_desc chfs_vnodeop_opv_desc = + { &chfs_vnodeop_p, chfs_vnodeop_entries }; + +/* --------------------------------------------------------------------- */ + +/* + * vnode operations vector used for special devices stored in a chfs + * file system. 
+ */ +int +(**chfs_specop_p)(void *); +const struct vnodeopv_entry_desc chfs_specop_entries[] = + { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, spec_lookup }, + { &vop_create_desc, spec_create }, + { &vop_mknod_desc, spec_mknod }, + { &vop_open_desc, spec_open }, + { &vop_close_desc, ufsspec_close }, + { &vop_access_desc, chfs_access }, + { &vop_getattr_desc, chfs_getattr }, + { &vop_setattr_desc, chfs_setattr }, + { &vop_read_desc, chfs_read }, + { &vop_write_desc, chfs_write }, + { &vop_ioctl_desc, spec_ioctl }, + { &vop_fcntl_desc, genfs_fcntl }, + { &vop_poll_desc, spec_poll }, + { &vop_kqfilter_desc, spec_kqfilter }, + { &vop_revoke_desc, spec_revoke }, + { &vop_mmap_desc, spec_mmap }, + { &vop_fsync_desc, spec_fsync }, + { &vop_seek_desc, spec_seek }, + { &vop_remove_desc, spec_remove }, + { &vop_link_desc, spec_link }, + { &vop_rename_desc, spec_rename }, + { &vop_mkdir_desc, spec_mkdir }, + { &vop_rmdir_desc, spec_rmdir }, + { &vop_symlink_desc, spec_symlink }, + { &vop_readdir_desc, spec_readdir }, + { &vop_readlink_desc, spec_readlink }, + { &vop_abortop_desc, spec_abortop }, + { &vop_inactive_desc, chfs_inactive }, + { &vop_reclaim_desc, chfs_reclaim }, + { &vop_lock_desc, genfs_lock }, + { &vop_unlock_desc, genfs_unlock }, + { &vop_bmap_desc, spec_bmap }, + { &vop_strategy_desc, spec_strategy }, + { &vop_print_desc, ufs_print }, + { &vop_pathconf_desc, spec_pathconf }, + { &vop_islocked_desc, genfs_islocked }, + { &vop_advlock_desc, spec_advlock }, + { &vop_bwrite_desc, vn_bwrite }, + { &vop_getpages_desc, spec_getpages }, + { &vop_putpages_desc, spec_putpages }, + { NULL, NULL } }; + +const struct vnodeopv_desc chfs_specop_opv_desc = + { &chfs_specop_p, chfs_specop_entries }; + +/* --------------------------------------------------------------------- */ +/* + * vnode operations vector used for fifos stored in a chfs file system. 
+ */ +int +(**chfs_fifoop_p)(void *); +const struct vnodeopv_entry_desc chfs_fifoop_entries[] = + { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, vn_fifo_bypass }, + { &vop_create_desc, vn_fifo_bypass }, + { &vop_mknod_desc, vn_fifo_bypass }, + { &vop_open_desc, vn_fifo_bypass }, + { &vop_close_desc, ufsfifo_close }, + { &vop_access_desc, chfs_access }, + { &vop_getattr_desc, chfs_getattr }, + { &vop_setattr_desc, chfs_setattr }, + { &vop_read_desc, ufsfifo_read }, + { &vop_write_desc, ufsfifo_write }, + { &vop_ioctl_desc, vn_fifo_bypass }, + { &vop_fcntl_desc, genfs_fcntl }, + { &vop_poll_desc, vn_fifo_bypass }, + { &vop_kqfilter_desc, vn_fifo_bypass }, + { &vop_revoke_desc, vn_fifo_bypass }, + { &vop_mmap_desc, vn_fifo_bypass }, + { &vop_fsync_desc, vn_fifo_bypass }, + { &vop_seek_desc, vn_fifo_bypass }, + { &vop_remove_desc, vn_fifo_bypass }, + { &vop_link_desc, vn_fifo_bypass }, + { &vop_rename_desc, vn_fifo_bypass }, + { &vop_mkdir_desc, vn_fifo_bypass }, + { &vop_rmdir_desc, vn_fifo_bypass }, + { &vop_symlink_desc, vn_fifo_bypass }, + { &vop_readdir_desc, vn_fifo_bypass }, + { &vop_readlink_desc, vn_fifo_bypass }, + { &vop_abortop_desc, vn_fifo_bypass }, + { &vop_inactive_desc, chfs_inactive }, + { &vop_reclaim_desc, chfs_reclaim }, + { &vop_lock_desc, genfs_lock }, + { &vop_unlock_desc, genfs_unlock }, + { &vop_bmap_desc, vn_fifo_bypass }, + { &vop_strategy_desc, vn_fifo_bypass }, + { &vop_print_desc, ufs_print }, + { &vop_pathconf_desc, vn_fifo_bypass }, + { &vop_islocked_desc, genfs_islocked }, + { &vop_advlock_desc, vn_fifo_bypass }, + { &vop_bwrite_desc, genfs_nullop }, + { &vop_getpages_desc, genfs_badop }, + { &vop_putpages_desc, vn_fifo_bypass }, + { NULL, NULL } }; + +const struct vnodeopv_desc chfs_fifoop_opv_desc = + { &chfs_fifoop_p, chfs_fifoop_entries }; diff --git a/sys/ufs/chfs/chfs_wbuf.c b/sys/ufs/chfs/chfs_wbuf.c new file mode 100644 index 000000000..c9823a696 --- /dev/null +++ b/sys/ufs/chfs/chfs_wbuf.c @@ -0,0 +1,259 @@ +/* $NetBSD: chfs_wbuf.c,v 1.2 2011/11/24 20:50:33 agc Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * Copyright (C) 2010 Tamas Toth + * Copyright (C) 2010 Adam Hoka + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include "chfs.h" +//#include + +#define DBG_WBUF 1 + +#define PAD(x) (((x)+3)&~3) + +#define EB_ADDRESS(x) ( ((unsigned long)(x) / chmp->chm_ebh->eb_size) * chmp->chm_ebh->eb_size ) + +#define PAGE_DIV(x) ( ((unsigned long)(x) / (unsigned long)(chmp->chm_wbuf_pagesize)) * (unsigned long)(chmp->chm_wbuf_pagesize) ) +#define PAGE_MOD(x) ( (unsigned long)(x) % (unsigned long)(chmp->chm_wbuf_pagesize) ) + +/* +// test functions +int wbuf_test(void); +void wbuf_test_erase_flash(struct chfs_mount*); +void wbuf_test_callback(struct erase_instruction*); +*/ + +#define NOPAD 0 +#define SETPAD 1 + + +/** + * chfs_flush_wbuf - write wbuf to the flash + * @chmp: super block info + * @pad: padding (NOPAD / SETPAD) + * Returns zero in case of success. + */ +static int +chfs_flush_wbuf(struct chfs_mount *chmp, int pad) +{ + int ret=0; + size_t retlen = 0; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + KASSERT(mutex_owned(&chmp->chm_lock_sizes)); + KASSERT(rw_write_held(&chmp->chm_lock_wbuf)); + + if (pad) { + chmp->chm_wbuf_len = PAD(chmp->chm_wbuf_len); + memset(chmp->chm_wbuf + chmp->chm_wbuf_len, 0, chmp->chm_wbuf_pagesize - chmp->chm_wbuf_len); + + struct chfs_flash_padding_node* padnode = (void*)(chmp->chm_wbuf + chmp->chm_wbuf_len); + padnode->magic = htole16(CHFS_FS_MAGIC_BITMASK); + padnode->type = htole16(CHFS_NODETYPE_PADDING); + padnode->length = htole32(chmp->chm_wbuf_pagesize - chmp->chm_wbuf_len); + padnode->hdr_crc = htole32(crc32(0, (uint8_t *)padnode, sizeof(*padnode)-4)); + + struct chfs_node_ref *nref; + nref = chfs_alloc_node_ref(chmp->chm_nextblock); + nref->nref_offset = chmp->chm_wbuf_ofs + chmp->chm_wbuf_len; + nref->nref_offset = CHFS_GET_OFS(nref->nref_offset) | + CHFS_OBSOLETE_NODE_MASK; + chmp->chm_wbuf_len = chmp->chm_wbuf_pagesize; + + chfs_change_size_free(chmp, chmp->chm_nextblock, -padnode->length); + chfs_change_size_wasted(chmp, chmp->chm_nextblock, padnode->length); + } + + ret = chfs_write_leb(chmp, chmp->chm_nextblock->lnr, chmp->chm_wbuf, chmp->chm_wbuf_ofs, chmp->chm_wbuf_len, &retlen); + if(ret) { + return ret; + } + + memset(chmp->chm_wbuf,0xff,chmp->chm_wbuf_pagesize); + chmp->chm_wbuf_ofs += chmp->chm_wbuf_pagesize; + chmp->chm_wbuf_len = 0; + return 0; +} + + +/** + * chfs_fill_wbuf - write to wbuf + * @chmp: super block info + * @buf: buffer + * @len: buffer length + * Return the len of the buf what we didn't write to the wbuf. 
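+ * (As implemented, the value returned is the number of bytes actually
+ * copied into the wbuf; 0 means the wbuf is empty and the data is large
+ * enough for the caller to write it out to flash directly.)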
+ */ +static size_t +chfs_fill_wbuf(struct chfs_mount *chmp, const u_char *buf, size_t len) +{ + if (len && !chmp->chm_wbuf_len && (len >= chmp->chm_wbuf_pagesize)) { + return 0; + } + if (len > (chmp->chm_wbuf_pagesize - chmp->chm_wbuf_len)) { + len = chmp->chm_wbuf_pagesize - chmp->chm_wbuf_len; + } + memcpy(chmp->chm_wbuf + chmp->chm_wbuf_len, buf, len); + + chmp->chm_wbuf_len += (int) len; + return len; +} + +/** + * chfs_write_wbuf - write to wbuf and then the flash + * @chmp: super block info + * @invecs: io vectors + * @count: num of vectors + * @to: offset of target + * @retlen: writed bytes + * Returns zero in case of success. + */ +int +chfs_write_wbuf(struct chfs_mount* chmp, const struct iovec *invecs, long count, + off_t to, size_t *retlen) +{ + int invec, ret = 0; + size_t wbuf_retlen, donelen = 0; + int outvec_to = to; + + int lnr = chmp->chm_nextblock->lnr; + + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + KASSERT(mutex_owned(&chmp->chm_lock_sizes)); + KASSERT(!rw_write_held(&chmp->chm_lock_wbuf)); + + rw_enter(&chmp->chm_lock_wbuf, RW_WRITER); + + //dbg("1. wbuf ofs: %zu, len: %zu\n", chmp->chm_wbuf_ofs, chmp->chm_wbuf_len); + + if (chmp->chm_wbuf_ofs == 0xffffffff) { + chmp->chm_wbuf_ofs = PAGE_DIV(to); + chmp->chm_wbuf_len = PAGE_MOD(to); + memset(chmp->chm_wbuf, 0xff, chmp->chm_wbuf_pagesize); + } + + //dbg("2. wbuf ofs: %zu, len: %zu\n", chmp->chm_wbuf_ofs, chmp->chm_wbuf_len); + + if (EB_ADDRESS(to) != EB_ADDRESS(chmp->chm_wbuf_ofs)) { + if (chmp->chm_wbuf_len) { + ret = chfs_flush_wbuf(chmp, SETPAD); + if (ret) + goto outerr; + } + chmp->chm_wbuf_ofs = PAGE_DIV(to); + chmp->chm_wbuf_len = PAGE_MOD(to); + } + + //dbg("3. wbuf ofs: %zu, len: %zu\n", chmp->chm_wbuf_ofs, chmp->chm_wbuf_len); + + if (to != PAD(chmp->chm_wbuf_ofs + chmp->chm_wbuf_len)) { + dbg("to: %llu != %zu\n", (unsigned long long)to, + PAD(chmp->chm_wbuf_ofs + chmp->chm_wbuf_len)); + dbg("Non-contiguous write\n"); + panic("BUG\n"); + } + + /* adjust alignment offset */ + if (chmp->chm_wbuf_len != PAGE_MOD(to)) { + chmp->chm_wbuf_len = PAGE_MOD(to); + /* take care of alignement to next page*/ + if (!chmp->chm_wbuf_len) { + chmp->chm_wbuf_len += chmp->chm_wbuf_pagesize; + ret = chfs_flush_wbuf(chmp, NOPAD); + if (ret) + goto outerr; + } + } + + for (invec = 0; invec < count; invec++) { + int vlen = invecs[invec].iov_len; + u_char* v = invecs[invec].iov_base; + + //dbg("invec:%d len:%d\n", invec, vlen); + + wbuf_retlen = chfs_fill_wbuf(chmp, v, vlen); + if (chmp->chm_wbuf_len == chmp->chm_wbuf_pagesize) { + ret = chfs_flush_wbuf(chmp, NOPAD); + if (ret) { + goto outerr; + } + } + vlen -= wbuf_retlen; + outvec_to += wbuf_retlen; + v += wbuf_retlen; + donelen += wbuf_retlen; + if (vlen >= chmp->chm_wbuf_pagesize) { + ret = chfs_write_leb(chmp, lnr, v, outvec_to, PAGE_DIV(vlen), &wbuf_retlen); + //dbg("fd->write: %zu\n", wbuf_retlen); + vlen -= wbuf_retlen; + outvec_to += wbuf_retlen; + chmp->chm_wbuf_ofs = outvec_to; + v += wbuf_retlen; + donelen += wbuf_retlen; + } + wbuf_retlen = chfs_fill_wbuf(chmp, v, vlen); + if (chmp->chm_wbuf_len == chmp->chm_wbuf_pagesize) { + ret = chfs_flush_wbuf(chmp, NOPAD); + if (ret) + goto outerr; + } + + // if we write the last vector, we flush with padding + /*if (invec == count-1) { + ret = chfs_flush_wbuf(chmp, SETPAD); + if (ret) + goto outerr; + }*/ + outvec_to += wbuf_retlen; + donelen += wbuf_retlen; + } + *retlen = donelen; + rw_exit(&chmp->chm_lock_wbuf); + return ret; + +outerr: + *retlen = 0; + return ret; +} + +int chfs_flush_pending_wbuf(struct chfs_mount 
*chmp) +{ + //dbg("flush pending wbuf\n"); + int err; + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + mutex_enter(&chmp->chm_lock_sizes); + rw_enter(&chmp->chm_lock_wbuf, RW_WRITER); + err = chfs_flush_wbuf(chmp, SETPAD); + rw_exit(&chmp->chm_lock_wbuf); + mutex_exit(&chmp->chm_lock_sizes); + return err; +} diff --git a/sys/ufs/chfs/chfs_write.c b/sys/ufs/chfs/chfs_write.c new file mode 100644 index 000000000..0838ed9b1 --- /dev/null +++ b/sys/ufs/chfs/chfs_write.c @@ -0,0 +1,545 @@ +/* $NetBSD: chfs_write.c,v 1.2 2011/11/24 21:09:37 agc Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * Copyright (C) 2010 David Tengeri + * Copyright (C) 2010 Tamas Toth + * Copyright (C) 2010 Adam Hoka + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * chfs_write.c + * + * Created on: 2010.02.17. 
+ * Author: dtengeri + */ + +#include +#include + +#include "chfs.h" + +int +chfs_write_flash_vnode(struct chfs_mount *chmp, + struct chfs_inode *ip, int prio) +{ + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + struct chfs_flash_vnode *fvnode; + struct chfs_vnode_cache* chvc; + struct chfs_node_ref *nref; + struct iovec vec; + size_t size, retlen; + int err = 0, retries = 0; + + if (ip->ino == CHFS_ROOTINO) + return 0; + + fvnode = chfs_alloc_flash_vnode(); + if (!fvnode) + return ENOMEM; + + chvc = ip->chvc; + + /* setting up flash_vnode members */ + size = sizeof(*fvnode); + //dbg("size: %zu | PADDED: %zu\n", size, CHFS_PAD(size)); + fvnode->magic = htole16(CHFS_FS_MAGIC_BITMASK); + fvnode->type = htole16(CHFS_NODETYPE_VNODE); + fvnode->length = htole32(CHFS_PAD(size)); + fvnode->hdr_crc = htole32(crc32(0, (uint8_t *)fvnode, + CHFS_NODE_HDR_SIZE - 4)); + fvnode->vno = htole64(ip->ino); + fvnode->version = htole64(++ip->chvc->highest_version); + fvnode->mode = htole32(ip->mode); + fvnode->dn_size = htole32(ip->size); + fvnode->atime = htole32(ip->atime); + fvnode->ctime = htole32(ip->ctime); + fvnode->mtime = htole32(ip->mtime); + fvnode->gid = htole32(ip->gid); + fvnode->uid = htole32(ip->uid); + fvnode->node_crc = htole32(crc32(0, (uint8_t *)fvnode, size - 4)); + + /* write out flash_vnode */ +retry: + if (prio == ALLOC_GC) { + /* the GC calls this function */ + err = chfs_reserve_space_gc(chmp, CHFS_PAD(size)); + if (err) + goto out; + } else { + chfs_gc_trigger(chmp); + if (prio == ALLOC_NORMAL) + err = chfs_reserve_space_normal(chmp, + CHFS_PAD(size), ALLOC_NORMAL); + else + err = chfs_reserve_space_normal(chmp, + CHFS_PAD(size), ALLOC_DELETION); + if (err) + goto out; + } + + nref = chfs_alloc_node_ref(chmp->chm_nextblock); + if (!nref) { + err = ENOMEM; + goto out; + } + + mutex_enter(&chmp->chm_lock_sizes); + + nref->nref_offset = chmp->chm_ebh->eb_size - chmp->chm_nextblock->free_size; + chfs_change_size_free(chmp, chmp->chm_nextblock, -CHFS_PAD(size)); + vec.iov_base = fvnode; + vec.iov_len = CHFS_PAD(size); + err = chfs_write_wbuf(chmp, &vec, 1, nref->nref_offset, &retlen); + if (err || retlen != CHFS_PAD(size)) { + chfs_err("error while writing out flash vnode to the media\n"); + chfs_err("err: %d | size: %zu | retlen : %zu\n", + err, CHFS_PAD(size), retlen); + chfs_change_size_dirty(chmp, + chmp->chm_nextblock, CHFS_PAD(size)); + if (retries) { + err = EIO; + mutex_exit(&chmp->chm_lock_sizes); + goto out; + } + + retries++; + mutex_exit(&chmp->chm_lock_sizes); + goto retry; + } + //Everything went well + chfs_change_size_used(chmp, + &chmp->chm_blocks[nref->nref_lnr], CHFS_PAD(size)); + mutex_exit(&chmp->chm_lock_sizes); + + chfs_add_vnode_ref_to_vc(chmp, chvc, nref); + KASSERT(chmp->chm_blocks[nref->nref_lnr].used_size <= chmp->chm_ebh->eb_size); +out: + chfs_free_flash_vnode(fvnode); + return err; +} + +int +chfs_write_flash_dirent(struct chfs_mount *chmp, struct chfs_inode *pdir, + struct chfs_inode *ip, struct chfs_dirent *fd, + ino_t ino, int prio) +{ + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + struct chfs_flash_dirent_node *fdirent; + struct chfs_node_ref *nref; + struct iovec vec[2]; + size_t size, retlen; + int err = 0, retries = 0; + uint8_t *name; + size_t namelen; + + KASSERT(fd->vno != CHFS_ROOTINO); + + fdirent = chfs_alloc_flash_dirent(); + if (!fdirent) + return ENOMEM; + + size = sizeof(*fdirent) + fd->nsize; + namelen = CHFS_PAD(size) - sizeof(*fdirent); + + name = kmem_zalloc(namelen, KM_SLEEP); + memcpy(name, fd->name, fd->nsize); + 
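+	/*
+	 * The name was copied into a zero-filled buffer of
+	 * CHFS_PAD(size) - sizeof(*fdirent) bytes, so the two iovecs set up
+	 * below add up to exactly CHFS_PAD(size) and the dirent node is
+	 * written to flash padded out to CHFS_PAD(size).
+	 */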
//dbg("namelen: %zu | nsize: %hhu\n", namelen, fd->nsize); + + + //dbg("size: %zu | PADDED: %zu\n", size, CHFS_PAD(size)); + fdirent->magic = htole16(CHFS_FS_MAGIC_BITMASK); + fdirent->type = htole16(CHFS_NODETYPE_DIRENT); + fdirent->length = htole32(CHFS_PAD(size)); + fdirent->hdr_crc = htole32(crc32(0, (uint8_t *)fdirent, + CHFS_NODE_HDR_SIZE - 4)); + fdirent->vno = htole64(ino); + fdirent->pvno = htole64(pdir->ino); + fdirent->version = htole64(++pdir->chvc->highest_version); + fdirent->mctime = ip?ip->ctime:0; + fdirent->nsize = fd->nsize; + fdirent->dtype = fd->type; + fdirent->name_crc = crc32(0, (uint8_t *)&(fd->name), fd->nsize); + fdirent->node_crc = crc32(0, (uint8_t *)fdirent, sizeof(*fdirent) - 4); + + vec[0].iov_base = fdirent; + vec[0].iov_len = sizeof(*fdirent); + vec[1].iov_base = name; + vec[1].iov_len = namelen; + +retry: + if (prio == ALLOC_GC) { + /* the GC calls this function */ + err = chfs_reserve_space_gc(chmp, CHFS_PAD(size)); + if (err) + goto out; + } else { + chfs_gc_trigger(chmp); + if (prio == ALLOC_NORMAL) + err = chfs_reserve_space_normal(chmp, + CHFS_PAD(size), ALLOC_NORMAL); + else + err = chfs_reserve_space_normal(chmp, + CHFS_PAD(size), ALLOC_DELETION); + if (err) + goto out; + } + + nref = chfs_alloc_node_ref(chmp->chm_nextblock); + if (!nref) { + err = ENOMEM; + goto out; + } + + mutex_enter(&chmp->chm_lock_sizes); + + nref->nref_offset = chmp->chm_ebh->eb_size - chmp->chm_nextblock->free_size; + chfs_change_size_free(chmp, chmp->chm_nextblock, -CHFS_PAD(size)); + + err = chfs_write_wbuf(chmp, vec, 2, nref->nref_offset, &retlen); + if (err || retlen != CHFS_PAD(size)) { + chfs_err("error while writing out flash dirent node to the media\n"); + chfs_err("err: %d | size: %zu | retlen : %zu\n", + err, CHFS_PAD(size), retlen); + chfs_change_size_dirty(chmp, + chmp->chm_nextblock, CHFS_PAD(size)); + if (retries) { + err = EIO; + mutex_exit(&chmp->chm_lock_sizes); + goto out; + } + + retries++; + mutex_exit(&chmp->chm_lock_sizes); + goto retry; + } + + + // Everything went well + chfs_change_size_used(chmp, + &chmp->chm_blocks[nref->nref_lnr], CHFS_PAD(size)); + mutex_exit(&chmp->chm_lock_sizes); + KASSERT(chmp->chm_blocks[nref->nref_lnr].used_size <= chmp->chm_ebh->eb_size); + fd->nref = nref; + if (prio != ALLOC_DELETION) { + chfs_add_node_to_list(chmp, + pdir->chvc, nref, &pdir->chvc->dirents); + } +out: + chfs_free_flash_dirent(fdirent); + return err; +} + +/** + * chfs_write_flash_dnode - write out a data node to flash + * @chmp: chfs mount structure + * @vp: vnode where the data belongs to + * @bp: buffer contains data + */ +int +chfs_write_flash_dnode(struct chfs_mount *chmp, struct vnode *vp, + struct buf *bp, struct chfs_full_dnode *fd) +{ + KASSERT(mutex_owned(&chmp->chm_lock_mountfields)); + + int err = 0, retries = 0; + size_t size, retlen; + off_t ofs; + struct chfs_flash_data_node *dnode; + struct chfs_node_ref *nref; + struct chfs_inode *ip = VTOI(vp); + struct iovec vec[2]; + uint32_t len; + void *tmpbuf = NULL; + + KASSERT(ip->ino != CHFS_ROOTINO); + + dnode = chfs_alloc_flash_dnode(); + if (!dnode) + return ENOMEM; + + /* initialize flash data node */ + ofs = bp->b_blkno * PAGE_SIZE; + //dbg("vp->v_size: %ju, bp->b_blkno: %ju, bp-b_data: %p," + // " bp->b_resid: %ju\n", + // (uintmax_t )vp->v_size, (uintmax_t )bp->b_blkno, + // bp->b_data, (uintmax_t )bp->b_resid); + //dbg("[XXX]vp->v_size - ofs: %llu\n", (vp->v_size - ofs)); + len = MIN((vp->v_size - ofs), bp->b_resid); + size = sizeof(*dnode) + len; + + dnode->magic = 
htole16(CHFS_FS_MAGIC_BITMASK); + dnode->type = htole16(CHFS_NODETYPE_DATA); + dnode->length = htole32(CHFS_PAD(size)); + dnode->hdr_crc = htole32(crc32(0, (uint8_t *)dnode, + CHFS_NODE_HDR_SIZE - 4)); + dnode->vno = htole64(ip->ino); + dnode->version = htole64(++ip->chvc->highest_version); + dnode->offset = htole64(ofs); + dnode->data_length = htole32(len); + dnode->data_crc = htole32(crc32(0, (uint8_t *)bp->b_data, len)); + dnode->node_crc = htole32(crc32(0, (uint8_t *)dnode, + sizeof(*dnode) - 4)); + + dbg("dnode @%llu %ub v%llu\n", (unsigned long long)dnode->offset, + dnode->data_length, (unsigned long long)dnode->version); + + if (CHFS_PAD(size) - sizeof(*dnode)) { + tmpbuf = kmem_zalloc(CHFS_PAD(size) + - sizeof(*dnode), KM_SLEEP); + memcpy(tmpbuf, bp->b_data, len); + } + + /* creating iovecs for wbuf */ + vec[0].iov_base = dnode; + vec[0].iov_len = sizeof(*dnode); + vec[1].iov_base = tmpbuf; + vec[1].iov_len = CHFS_PAD(size) - sizeof(*dnode); + + fd->frags = 0; + fd->ofs = ofs; + fd->size = len; + +retry: + + /* Reserve space for data node. This will set up the next eraseblock + * where to we will write. + */ + + chfs_gc_trigger(chmp); + err = chfs_reserve_space_normal(chmp, + CHFS_PAD(size), ALLOC_NORMAL); + if (err) + goto out; + + nref = chfs_alloc_node_ref(chmp->chm_nextblock); + if (!nref) { + err = ENOMEM; + goto out; + } + + nref->nref_offset = + chmp->chm_ebh->eb_size - chmp->chm_nextblock->free_size; + + KASSERT(nref->nref_offset < chmp->chm_ebh->eb_size); + + mutex_enter(&chmp->chm_lock_sizes); + + chfs_change_size_free(chmp, + chmp->chm_nextblock, -CHFS_PAD(size)); + + //dbg("vno: %llu nref lnr: %u offset: %u\n", + // dnode->vno, nref->nref_lnr, nref->nref_offset); + + err = chfs_write_wbuf(chmp, vec, 2, nref->nref_offset, &retlen); + if (err || retlen != CHFS_PAD(size)) { + chfs_err("error while writing out flash data node to the media\n"); + chfs_err("err: %d | size: %zu | retlen : %zu\n", + err, size, retlen); + chfs_change_size_dirty(chmp, + chmp->chm_nextblock, CHFS_PAD(size)); + if (retries) { + err = EIO; + mutex_exit(&chmp->chm_lock_sizes); + goto out; + } + + retries++; + mutex_exit(&chmp->chm_lock_sizes); + goto retry; + } + /* Everything went well */ + ip->write_size += fd->size; + chfs_change_size_used(chmp, + &chmp->chm_blocks[nref->nref_lnr], CHFS_PAD(size)); + mutex_exit(&chmp->chm_lock_sizes); + + KASSERT(chmp->chm_blocks[nref->nref_lnr].used_size <= chmp->chm_ebh->eb_size); + fd->nref = nref; + chfs_add_node_to_list(chmp, ip->chvc, nref, &ip->chvc->dnode); +out: + chfs_free_flash_dnode(dnode); + if (CHFS_PAD(size) - sizeof(*dnode)) { + kmem_free(tmpbuf, CHFS_PAD(size) - sizeof(*dnode)); + } + + return err; +} + +/** + * chfs_do_link - makes a copy from a node + * @old: old node + * @oldfd: dirent of old node + * @parent: parent of new node + * @name: name of new node + * @namelen: length of name + * This function writes the dirent of the new node to the media. 
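+ * (The parameter list above appears to describe an earlier prototype;
+ * the function below takes (ip, parent, name, namelen, type), bumps the
+ * link counts of both inodes and writes the new dirent for ip under
+ * parent.)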
+ */ +int +chfs_do_link(struct chfs_inode *ip, struct chfs_inode *parent, const char *name, int namelen, enum vtype type) +{ + int error = 0; + struct vnode *vp = ITOV(ip); + struct ufsmount *ump = VFSTOUFS(vp->v_mount); + struct chfs_mount *chmp = ump->um_chfs; + struct chfs_dirent *newfd = NULL; +// struct chfs_dirent *fd = NULL; + + //dbg("link vno: %llu\n", ip->ino); + + newfd = chfs_alloc_dirent(namelen + 1); + + newfd->vno = ip->ino; + newfd->type = type; + newfd->nsize = namelen; + memcpy(newfd->name, name, namelen); + newfd->name[newfd->nsize] = 0; +// newfd->next = NULL; + + ip->chvc->nlink++; + parent->chvc->nlink++; + ip->iflag |= IN_CHANGE; + chfs_update(vp, NULL, NULL, UPDATE_WAIT); + + mutex_enter(&chmp->chm_lock_mountfields); + + error = chfs_write_flash_vnode(chmp, ip, ALLOC_NORMAL); + if (error) + return error; + + error = chfs_write_flash_dirent(chmp, + parent, ip, newfd, ip->ino, ALLOC_NORMAL); + /* TODO: what should we do if error isn't zero? */ + + mutex_exit(&chmp->chm_lock_mountfields); + + /* add fd to the fd list */ + TAILQ_INSERT_TAIL(&parent->dents, newfd, fds); +#if 0 + fd = parent->dents; + if (!fd) { + parent->dents = newfd; + } else { + while (fd->next) + fd = fd->next; + fd->next = newfd; + } +#endif + + return error; +} + + +/** + * chfs_do_unlink - delete a node + * @ip: node what we'd like to delete + * @parent: parent of the node + * @name: name of the node + * @namelen: length of name + * This function set the nlink and vno of the node zero and write its dirent to the media. + */ +int +chfs_do_unlink(struct chfs_inode *ip, + struct chfs_inode *parent, const char *name, int namelen) +{ + struct chfs_dirent *fd, *tmpfd; + int error = 0; + struct vnode *vp = ITOV(ip); + struct ufsmount *ump = VFSTOUFS(vp->v_mount); + struct chfs_mount *chmp = ump->um_chfs; + struct chfs_node_ref *nref; + + //dbg("unlink vno: %llu\n", ip->ino); + + vflushbuf(vp, 0); + + mutex_enter(&chmp->chm_lock_mountfields); + + /* remove the full direntry from the parent dents list */ + TAILQ_FOREACH_SAFE(fd, &parent->dents, fds, tmpfd) { + if (fd->vno == ip->ino && + fd->nsize == namelen && + !memcmp(fd->name, name, fd->nsize)) { + if (fd->type == VDIR && ip->chvc->nlink == 2) + ip->chvc->nlink = 0; + else + ip->chvc->nlink--; + + fd->type = VNON; + + TAILQ_REMOVE(&parent->dents, fd, fds); + + /* remove nref from dirents list */ + nref = parent->chvc->dirents; + if (nref == fd->nref) { + nref->nref_next = fd->nref->nref_next; + } else { + while (nref->nref_next && nref->nref_next != fd->nref) + nref = nref->nref_next; + if (nref->nref_next) + nref->nref_next = fd->nref->nref_next; + } + + //dbg("FD->NREF vno: %llu, lnr: %u, ofs: %u\n", + // fd->vno, fd->nref->nref_lnr, fd->nref->nref_offset); + chfs_mark_node_obsolete(chmp, fd->nref); + + error = chfs_write_flash_dirent(chmp, + parent, ip, fd, 0, ALLOC_DELETION); + + //dbg("FD->NREF vno: %llu, lnr: %u, ofs: %u\n", + // fd->vno, fd->nref->nref_lnr, fd->nref->nref_offset); + chfs_mark_node_obsolete(chmp, fd->nref); + + nref = ip->chvc->dnode; + while (nref != (struct chfs_node_ref *)ip->chvc) { + //dbg("DATA NREF\n"); + chfs_mark_node_obsolete(chmp, nref); + nref = nref->nref_next; + } + ip->chvc->dnode = (struct chfs_node_ref *)ip->chvc; + + nref = ip->chvc->v; + while (nref != (struct chfs_node_ref *)ip->chvc) { + //dbg("V NREF\n"); + chfs_mark_node_obsolete(chmp, nref); + nref = nref->nref_next; + } + ip->chvc->v = ip->chvc->v->nref_next; + + parent->chvc->nlink--; + //TODO: if error + } + } + mutex_exit(&chmp->chm_lock_mountfields); 
+ + return error; +} diff --git a/sys/ufs/chfs/debug.c b/sys/ufs/chfs/debug.c new file mode 100644 index 000000000..0d1fa5b52 --- /dev/null +++ b/sys/ufs/chfs/debug.c @@ -0,0 +1,48 @@ +/* $NetBSD: debug.c,v 1.1 2011/11/24 15:51:32 ahoka Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * XipFFS -- Xip Flash File System + * + * Copyright (C) 2009 Ferenc Havasi , + * Zoltan Sogor , + * ... + * University of Szeged, Hungary + * + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include "chfs.h" +//#include + diff --git a/include/ufs/chfs/debug.h b/sys/ufs/chfs/debug.h similarity index 100% rename from include/ufs/chfs/debug.h rename to sys/ufs/chfs/debug.h diff --git a/sys/ufs/chfs/ebh.c b/sys/ufs/chfs/ebh.c new file mode 100644 index 000000000..ff0d984ee --- /dev/null +++ b/sys/ufs/chfs/ebh.c @@ -0,0 +1,2141 @@ +/* $NetBSD: ebh.c,v 1.2 2011/11/25 11:15:24 ahoka Exp $ */ + +/*- + * Copyright (c) 2010 Department of Software Engineering, + * University of Szeged, Hungary + * Copyright (C) 2009 Ferenc Havasi + * Copyright (C) 2009 Zoltan Sogor + * Copyright (C) 2009 David Tengeri + * Copyright (C) 2009 Tamas Toth + * Copyright (C) 2010 Adam Hoka + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the Department of Software Engineering, University of Szeged, Hungary + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "ebh.h" + +/*****************************************************************************/ +/* Flash specific operations */ +/*****************************************************************************/ +int nor_create_eb_hdr(struct chfs_eb_hdr *ebhdr, int lnr); +int nand_create_eb_hdr(struct chfs_eb_hdr *ebhdr, int lnr); +int nor_calc_data_offs(struct chfs_ebh *ebh, int pebnr, int offset); +int nand_calc_data_offs(struct chfs_ebh *ebh, int pebnr, int offset); +int nor_read_eb_hdr(struct chfs_ebh *ebh, int pebnr, struct chfs_eb_hdr *ebhdr); +int nand_read_eb_hdr(struct chfs_ebh *ebh, int pebnr, struct chfs_eb_hdr *ebhdr); +int nor_write_eb_hdr(struct chfs_ebh *ebh, int pebnr, struct chfs_eb_hdr *ebhdr); +int nand_write_eb_hdr(struct chfs_ebh *ebh, int pebnr,struct chfs_eb_hdr *ebhdr); +int nor_check_eb_hdr(struct chfs_ebh *ebh, void *buf); +int nand_check_eb_hdr(struct chfs_ebh *ebh, void *buf); +int nor_mark_eb_hdr_dirty_flash(struct chfs_ebh *ebh, int pebnr, int lid); +int nor_invalidate_eb_hdr(struct chfs_ebh *ebh, int pebnr); +int mark_eb_hdr_free(struct chfs_ebh *ebh, int pebnr, int ec); + +int ltree_entry_cmp(struct chfs_ltree_entry *le1, struct chfs_ltree_entry *le2); +int peb_in_use_cmp(struct chfs_peb *peb1, struct chfs_peb *peb2); +int peb_free_cmp(struct chfs_peb *peb1, struct chfs_peb *peb2); +int add_peb_to_erase_queue(struct chfs_ebh *ebh, int pebnr, int ec,struct peb_queue *queue); +struct chfs_peb * find_peb_in_use(struct chfs_ebh *ebh, int pebnr); +int add_peb_to_free(struct chfs_ebh *ebh, int pebnr, int ec); +int add_peb_to_in_use(struct chfs_ebh *ebh, int pebnr, int ec); +void erase_callback(struct flash_erase_instruction *ei); +int free_peb(struct chfs_ebh *ebh); +int release_peb(struct chfs_ebh *ebh, int pebnr); +void erase_thread(void *data); +static void erase_thread_start(struct chfs_ebh *ebh); +static void erase_thread_stop(struct chfs_ebh *ebh); +int scan_leb_used_cmp(struct chfs_scan_leb *sleb1, struct chfs_scan_leb *sleb2); +int nor_scan_add_to_used(struct chfs_ebh *ebh, struct chfs_scan_info *si,struct chfs_eb_hdr *ebhdr, int pebnr, int leb_status); +int nor_process_eb(struct chfs_ebh *ebh, struct chfs_scan_info *si, + int pebnr, struct chfs_eb_hdr *ebhdr); +int nand_scan_add_to_used(struct chfs_ebh *ebh, struct chfs_scan_info *si,struct chfs_eb_hdr *ebhdr, int pebnr); +int nand_process_eb(struct chfs_ebh *ebh, struct chfs_scan_info *si, + int pebnr, struct chfs_eb_hdr *ebhdr); +struct chfs_scan_info *chfs_scan(struct chfs_ebh *ebh); +void scan_info_destroy(struct chfs_scan_info *si); +int scan_media(struct chfs_ebh *ebh); +int get_peb(struct chfs_ebh *ebh); +/** + * nor_create_eb_hdr - creates an eraseblock header for NOR flash + * @ebhdr: ebhdr to set + * @lnr: LEB number + */ +int 
+nor_create_eb_hdr(struct chfs_eb_hdr *ebhdr, int lnr) +{ + ebhdr->u.nor_hdr.lid = htole32(lnr); + return 0; +} + +/** + * nand_create_eb_hdr - creates an eraseblock header for NAND flash + * @ebhdr: ebhdr to set + * @lnr: LEB number + */ +int +nand_create_eb_hdr(struct chfs_eb_hdr *ebhdr, int lnr) +{ + ebhdr->u.nand_hdr.lid = htole32(lnr); + return 0; +} + +/** + * nor_calc_data_offs - calculates data offset on NOR flash + * @ebh: chfs eraseblock handler + * @pebnr: eraseblock number + * @offset: offset within the eraseblock + */ +int +nor_calc_data_offs(struct chfs_ebh *ebh, int pebnr, int offset) +{ + return pebnr * ebh->flash_if->erasesize + offset + + CHFS_EB_EC_HDR_SIZE + CHFS_EB_HDR_NOR_SIZE; +} + +/** + * nand_calc_data_offs - calculates data offset on NAND flash + * @ebh: chfs eraseblock handler + * @pebnr: eraseblock number + * @offset: offset within the eraseblock + */ +int +nand_calc_data_offs(struct chfs_ebh *ebh, int pebnr, int offset) +{ + return pebnr * ebh->flash_if->erasesize + offset + + 2 * ebh->flash_if->page_size; +} + +/** + * nor_read_eb_hdr - read ereaseblock header from NOR flash + * + * @ebh: chfs eraseblock handler + * @pebnr: eraseblock number + * @ebhdr: whereto store the data + * + * Reads the eraseblock header from media. + * Returns zero in case of success, error code in case of fail. + */ +int +nor_read_eb_hdr(struct chfs_ebh *ebh, + int pebnr, struct chfs_eb_hdr *ebhdr) +{ + int ret; + size_t retlen; + off_t ofs = pebnr * ebh->flash_if->erasesize; + + KASSERT(pebnr >= 0 && pebnr < ebh->peb_nr); + + ret = flash_read(ebh->flash_dev, + ofs, CHFS_EB_EC_HDR_SIZE, + &retlen, (unsigned char *) &ebhdr->ec_hdr); + + if (ret || retlen != CHFS_EB_EC_HDR_SIZE) + return ret; + + ofs += CHFS_EB_EC_HDR_SIZE; + ret = flash_read(ebh->flash_dev, + ofs, CHFS_EB_HDR_NOR_SIZE, + &retlen, (unsigned char *) &ebhdr->u.nor_hdr); + + if (ret || retlen != CHFS_EB_HDR_NOR_SIZE) + return ret; + + return 0; +} + +/** + * nand_read_eb_hdr - read ereaseblock header from NAND flash + * + * @ebh: chfs eraseblock handler + * @pebnr: eraseblock number + * @ebhdr: whereto store the data + * + * Reads the eraseblock header from media. It is on the first two page. + * Returns zero in case of success, error code in case of fail. + */ +int +nand_read_eb_hdr(struct chfs_ebh *ebh, int pebnr, + struct chfs_eb_hdr *ebhdr) +{ + int ret; + size_t retlen; + off_t ofs; + + KASSERT(pebnr >= 0 && pebnr < ebh->peb_nr); + + /* Read erase counter header from the first page. */ + ofs = pebnr * ebh->flash_if->erasesize; + ret = flash_read(ebh->flash_dev, + ofs, CHFS_EB_EC_HDR_SIZE, &retlen, + (unsigned char *) &ebhdr->ec_hdr); + if (ret || retlen != CHFS_EB_EC_HDR_SIZE) + return ret; + + /* Read NAND eraseblock header from the second page */ + ofs += ebh->flash_if->page_size; + ret = flash_read(ebh->flash_dev, + ofs, CHFS_EB_HDR_NAND_SIZE, &retlen, + (unsigned char *) &ebhdr->u.nand_hdr); + if (ret || retlen != CHFS_EB_HDR_NAND_SIZE) + return ret; + + return 0; +} + +/** + * nor_write_eb_hdr - write ereaseblock header to NOR flash + * + * @ebh: chfs eraseblock handler + * @pebnr: eraseblock number whereto write + * @ebh: ebh to write + * + * Writes the eraseblock header to media. + * Returns zero in case of success, error code in case of fail. 
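+ *
+ * The CRC is computed over everything in the NOR header except its first
+ * four bytes, i.e. except the crc field itself; nor_check_eb_hdr()
+ * recomputes it the same way.  Note that the result is stored below
+ * through u.nand_hdr.crc, which presumably only works because both union
+ * members keep their crc field at offset 0.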
+ */ +int +nor_write_eb_hdr(struct chfs_ebh *ebh, int pebnr, struct chfs_eb_hdr *ebhdr) +{ + int ret, crc; + size_t retlen; + + off_t ofs = pebnr * ebh->flash_if->erasesize + CHFS_EB_EC_HDR_SIZE; + + ebhdr->u.nor_hdr.lid = ebhdr->u.nor_hdr.lid + | htole32(CHFS_LID_NOT_DIRTY_BIT); + + crc = crc32(0, (uint8_t *)&ebhdr->u.nor_hdr + 4, + CHFS_EB_HDR_NOR_SIZE - 4); + ebhdr->u.nand_hdr.crc = htole32(crc); + + KASSERT(pebnr >= 0 && pebnr < ebh->peb_nr); + + ret = flash_write(ebh->flash_dev, + ofs, CHFS_EB_HDR_NOR_SIZE, &retlen, + (unsigned char *) &ebhdr->u.nor_hdr); + + if (ret || retlen != CHFS_EB_HDR_NOR_SIZE) + return ret; + + return 0; +} + +/** + * nand_write_eb_hdr - write ereaseblock header to NAND flash + * + * @ebh: chfs eraseblock handler + * @pebnr: eraseblock number whereto write + * @ebh: ebh to write + * + * Writes the eraseblock header to media. + * Returns zero in case of success, error code in case of fail. + */ +int +nand_write_eb_hdr(struct chfs_ebh *ebh, int pebnr, + struct chfs_eb_hdr *ebhdr) +{ + int ret, crc; + size_t retlen; + flash_off_t ofs; + + KASSERT(pebnr >= 0 && pebnr < ebh->peb_nr); + + ofs = pebnr * ebh->flash_if->erasesize + + ebh->flash_if->page_size; + + ebhdr->u.nand_hdr.serial = htole64(++(*ebh->max_serial)); + + crc = crc32(0, (uint8_t *)&ebhdr->u.nand_hdr + 4, + CHFS_EB_HDR_NAND_SIZE - 4); + ebhdr->u.nand_hdr.crc = htole32(crc); + + ret = flash_write(ebh->flash_dev, ofs, + CHFS_EB_HDR_NAND_SIZE, &retlen, + (unsigned char *) &ebhdr->u.nand_hdr); + + if (ret || retlen != CHFS_EB_HDR_NAND_SIZE) + return ret; + + return 0; +} + +/** + * nor_check_eb_hdr - check ereaseblock header read from NOR flash + * + * @ebh: chfs eraseblock handler + * @buf: eraseblock header to check + * + * Returns eraseblock header status. + */ +int +nor_check_eb_hdr(struct chfs_ebh *ebh, void *buf) +{ + uint32_t magic, crc, hdr_crc; + struct chfs_eb_hdr *ebhdr = buf; + le32 lid_save; + + //check is there a header + if (check_pattern((void *) &ebhdr->ec_hdr, + 0xFF, 0, CHFS_EB_EC_HDR_SIZE)) { + dbg_ebh("no header found\n"); + return EBHDR_LEB_NO_HDR; + } + + // check magic + magic = le32toh(ebhdr->ec_hdr.magic); + if (magic != CHFS_MAGIC_BITMASK) { + dbg_ebh("bad magic bitmask(exp: %x found %x)\n", + CHFS_MAGIC_BITMASK, magic); + return EBHDR_LEB_BADMAGIC; + } + + // check CRC_EC + hdr_crc = le32toh(ebhdr->ec_hdr.crc_ec); + crc = crc32(0, (uint8_t *) &ebhdr->ec_hdr + 8, 4); + if (hdr_crc != crc) { + dbg_ebh("bad crc_ec found\n"); + return EBHDR_LEB_BADCRC; + } + + /* check if the PEB is free: magic, crc_ec and erase_cnt is good and + * everything else is FFF.. 
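+	 * (check_pattern() is used here to tell whether the region still
+	 * reads as erased flash, i.e. 0xFF in every byte; a free PEB is
+	 * therefore one with a valid EC header but an all-0xFF NOR header.)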
+ */ + if (check_pattern((void *) &ebhdr->u.nor_hdr, 0xFF, 0, + CHFS_EB_HDR_NOR_SIZE)) { + dbg_ebh("free peb found\n"); + return EBHDR_LEB_FREE; + } + + // check invalidated (CRC == LID == 0) + if (ebhdr->u.nor_hdr.crc == 0 && ebhdr->u.nor_hdr.lid == 0) { + dbg_ebh("invalidated ebhdr found\n"); + return EBHDR_LEB_INVALIDATED; + } + + // check CRC + hdr_crc = le32toh(ebhdr->u.nor_hdr.crc); + lid_save = ebhdr->u.nor_hdr.lid; + + // mark lid as not dirty for crc calc + ebhdr->u.nor_hdr.lid = ebhdr->u.nor_hdr.lid | htole32( + CHFS_LID_NOT_DIRTY_BIT); + crc = crc32(0, (uint8_t *) &ebhdr->u.nor_hdr + 4, + CHFS_EB_HDR_NOR_SIZE - 4); + // restore the original lid value in ebh + ebhdr->u.nor_hdr.lid = lid_save; + + if (crc != hdr_crc) { + dbg_ebh("bad crc found\n"); + return EBHDR_LEB_BADCRC; + } + + // check dirty + if (!(le32toh(lid_save) & CHFS_LID_NOT_DIRTY_BIT)) { + dbg_ebh("dirty ebhdr found\n"); + return EBHDR_LEB_DIRTY; + } + + return EBHDR_LEB_OK; +} + +/** + * nand_check_eb_hdr - check ereaseblock header read from NAND flash + * + * @ebh: chfs eraseblock handler + * @buf: eraseblock header to check + * + * Returns eraseblock header status. + */ +int +nand_check_eb_hdr(struct chfs_ebh *ebh, void *buf) +{ + uint32_t magic, crc, hdr_crc; + struct chfs_eb_hdr *ebhdr = buf; + + //check is there a header + if (check_pattern((void *) &ebhdr->ec_hdr, + 0xFF, 0, CHFS_EB_EC_HDR_SIZE)) { + dbg_ebh("no header found\n"); + return EBHDR_LEB_NO_HDR; + } + + // check magic + magic = le32toh(ebhdr->ec_hdr.magic); + if (magic != CHFS_MAGIC_BITMASK) { + dbg_ebh("bad magic bitmask(exp: %x found %x)\n", + CHFS_MAGIC_BITMASK, magic); + return EBHDR_LEB_BADMAGIC; + } + + // check CRC_EC + hdr_crc = le32toh(ebhdr->ec_hdr.crc_ec); + crc = crc32(0, (uint8_t *) &ebhdr->ec_hdr + 8, 4); + if (hdr_crc != crc) { + dbg_ebh("bad crc_ec found\n"); + return EBHDR_LEB_BADCRC; + } + + /* check if the PEB is free: magic, crc_ec and erase_cnt is good and + * everything else is FFF.. + */ + if (check_pattern((void *) &ebhdr->u.nand_hdr, 0xFF, 0, + CHFS_EB_HDR_NAND_SIZE)) { + dbg_ebh("free peb found\n"); + return EBHDR_LEB_FREE; + } + + // check CRC + hdr_crc = le32toh(ebhdr->u.nand_hdr.crc); + + crc = crc32(0, (uint8_t *) &ebhdr->u.nand_hdr + 4, + CHFS_EB_HDR_NAND_SIZE - 4); + + if (crc != hdr_crc) { + dbg_ebh("bad crc found\n"); + return EBHDR_LEB_BADCRC; + } + + return EBHDR_LEB_OK; +} + +/** + * nor_mark_eb_hdr_dirty_flash- mark ereaseblock header dirty on NOR flash + * + * @ebh: chfs eraseblock handler + * @pebnr: eraseblock number + * @lid: leb id (it's bit number 31 will be set to 0) + * + * It pulls the CHFS_LID_NOT_DIRTY_BIT to zero on flash. + * + * Returns zero in case of success, error code in case of fail. + */ +int +nor_mark_eb_hdr_dirty_flash(struct chfs_ebh *ebh, int pebnr, int lid) +{ + int ret; + size_t retlen; + off_t ofs; + + /* mark leb id dirty */ + lid = htole32(lid & CHFS_LID_DIRTY_BIT_MASK); + + /* calculate position */ + ofs = pebnr * ebh->flash_if->erasesize + CHFS_EB_EC_HDR_SIZE + + CHFS_GET_MEMBER_POS(struct chfs_nor_eb_hdr , lid); + + ret = flash_write(ebh->flash_dev, ofs, sizeof(lid), &retlen, + (unsigned char *) &lid); + if (ret || retlen != sizeof(lid)) { + chfs_err("can't mark peb dirty"); + return ret; + } + + return 0; +} + +/** + * nor_invalidate_eb_hdr - invalidate ereaseblock header on NOR flash + * + * @ebh: chfs eraseblock handler + * @pebnr: eraseblock number + * + * Sets crc and lip field to zero. + * Returns zero in case of success, error code in case of fail. 
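+ *
+ * Overwriting these fields with zeroes works without an erase cycle
+ * because NOR flash lets individual bits be programmed from 1 to 0 in
+ * place; the same property is what makes the dirty-bit update in
+ * nor_mark_eb_hdr_dirty_flash() possible.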
+ */ +int +nor_invalidate_eb_hdr(struct chfs_ebh *ebh, int pebnr) +{ + int ret; + size_t retlen; + off_t ofs; + char zero_buf[CHFS_INVALIDATE_SIZE]; + + /* fill with zero */ + memset(zero_buf, 0x0, CHFS_INVALIDATE_SIZE); + + /* calculate position (!!! lid is directly behind crc !!!) */ + ofs = pebnr * ebh->flash_if->erasesize + CHFS_EB_EC_HDR_SIZE + + CHFS_GET_MEMBER_POS(struct chfs_nor_eb_hdr, crc); + + ret = flash_write(ebh->flash_dev, + ofs, CHFS_INVALIDATE_SIZE, &retlen, + (unsigned char *) &zero_buf); + if (ret || retlen != CHFS_INVALIDATE_SIZE) { + chfs_err("can't invalidate peb"); + return ret; + } + + return 0; +} + +/** + * mark_eb_hdr_free - free ereaseblock header on NOR or NAND flash + * + * @ebh: chfs eraseblock handler + * @pebnr: eraseblock number + * @ec: erase counter of PEB + * + * Write out the magic and erase counter to the physical eraseblock. + * Returns zero in case of success, error code in case of fail. + */ +int +mark_eb_hdr_free(struct chfs_ebh *ebh, int pebnr, int ec) +{ + int ret, crc; + size_t retlen; + off_t ofs; + struct chfs_eb_hdr *ebhdr; + ebhdr = kmem_alloc(sizeof(struct chfs_eb_hdr), KM_SLEEP); + + ebhdr->ec_hdr.magic = htole32(CHFS_MAGIC_BITMASK); + ebhdr->ec_hdr.erase_cnt = htole32(ec); + crc = crc32(0, (uint8_t *) &ebhdr->ec_hdr + 8, 4); + ebhdr->ec_hdr.crc_ec = htole32(crc); + + ofs = pebnr * ebh->flash_if->erasesize; + + KASSERT(sizeof(ebhdr->ec_hdr) == CHFS_EB_EC_HDR_SIZE); + + ret = flash_write(ebh->flash_dev, + ofs, CHFS_EB_EC_HDR_SIZE, &retlen, + (unsigned char *) &ebhdr->ec_hdr); + + if (ret || retlen != CHFS_EB_EC_HDR_SIZE) { + chfs_err("can't mark peb as free: %d\n", pebnr); + kmem_free(ebhdr, sizeof(struct chfs_eb_hdr)); + return ret; + } + + kmem_free(ebhdr, sizeof(struct chfs_eb_hdr)); + return 0; +} + +/*****************************************************************************/ +/* End of Flash specific operations */ +/*****************************************************************************/ + +/*****************************************************************************/ +/* Lock Tree */ +/*****************************************************************************/ + +int +ltree_entry_cmp(struct chfs_ltree_entry *le1, + struct chfs_ltree_entry *le2) +{ + return (le1->lnr - le2->lnr); +} + +/* Generate functions for Lock tree's red-black tree */ +RB_PROTOTYPE( ltree_rbtree, chfs_ltree_entry, rb, ltree_entry_cmp); +RB_GENERATE( ltree_rbtree, chfs_ltree_entry, rb, ltree_entry_cmp); + + +/** + * ltree_lookup - looks up a logical eraseblock in the lock tree + * @ebh: chfs eraseblock handler + * @lid: identifier of the logical eraseblock + * + * This function returns a pointer to the wanted &struct chfs_ltree_entry + * if the logical eraseblock is in the lock tree, so it is locked, NULL + * otherwise. + * @ebh->ltree_lock has to be locked! + */ +static struct chfs_ltree_entry * +ltree_lookup(struct chfs_ebh *ebh, int lnr) +{ + struct chfs_ltree_entry le, *result; + le.lnr = lnr; + result = RB_FIND(ltree_rbtree, &ebh->ltree, &le); + return result; +} + +/** + * ltree_add_entry - add an entry to the lock tree + * @ebh: chfs eraseblock handler + * @lnr: identifier of the logical eraseblock + * + * This function adds a new logical eraseblock entry identified with @lnr to the + * lock tree. If the entry is already in the tree, it increases the user + * counter. + * Returns NULL if can not allocate memory for lock tree entry, or a pointer + * to the inserted entry otherwise. 
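+ * (Since the allocation below uses KM_SLEEP it cannot actually fail, so
+ * the NULL case is defensive only.)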
+ */ +static struct chfs_ltree_entry * +ltree_add_entry(struct chfs_ebh *ebh, int lnr) +{ + struct chfs_ltree_entry *le, *result; + + le = kmem_alloc(sizeof(struct chfs_ltree_entry), KM_SLEEP); + + le->lnr = lnr; + le->users = 1; + rw_init(&le->mutex); + + //dbg_ebh("enter ltree lock\n"); + mutex_enter(&ebh->ltree_lock); + //dbg_ebh("insert\n"); + result = RB_INSERT(ltree_rbtree, &ebh->ltree, le); + //dbg_ebh("inserted\n"); + if (result) { + //The entry is already in the tree + result->users++; + kmem_free(le, sizeof(struct chfs_ltree_entry)); + } + else { + result = le; + } + mutex_exit(&ebh->ltree_lock); + + return result; +} + +/** + * leb_read_lock - lock a logical eraseblock for read + * @ebh: chfs eraseblock handler + * @lnr: identifier of the logical eraseblock + * + * Returns zero in case of success, error code in case of fail. + */ +static int +leb_read_lock(struct chfs_ebh *ebh, int lnr) +{ + struct chfs_ltree_entry *le; + + le = ltree_add_entry(ebh, lnr); + if (!le) + return ENOMEM; + + rw_enter(&le->mutex, RW_READER); + return 0; +} + +/** + * leb_read_unlock - unlock a logical eraseblock from read + * @ebh: chfs eraseblock handler + * @lnr: identifier of the logical eraseblock + * + * This function unlocks a logical eraseblock from read and delete it from the + * lock tree is there are no more users of it. + */ +static void +leb_read_unlock(struct chfs_ebh *ebh, int lnr) +{ + struct chfs_ltree_entry *le; + + mutex_enter(&ebh->ltree_lock); + //dbg_ebh("LOCK: ebh->ltree_lock spin locked in leb_read_unlock()\n"); + le = ltree_lookup(ebh, lnr); + if (!le) + goto out; + + le->users -= 1; + KASSERT(le->users >= 0); + rw_exit(&le->mutex); + if (le->users == 0) { + le = RB_REMOVE(ltree_rbtree, &ebh->ltree, le); + if (le) { + KASSERT(!rw_lock_held(&le->mutex)); + rw_destroy(&le->mutex); + + kmem_free(le, sizeof(struct chfs_ltree_entry)); + } + } + +out: + mutex_exit(&ebh->ltree_lock); + //dbg_ebh("UNLOCK: ebh->ltree_lock spin unlocked in leb_read_unlock()\n"); +} + +/** + * leb_write_lock - lock a logical eraseblock for write + * @ebh: chfs eraseblock handler + * @lnr: identifier of the logical eraseblock + * + * Returns zero in case of success, error code in case of fail. + */ +static int +leb_write_lock(struct chfs_ebh *ebh, int lnr) +{ + struct chfs_ltree_entry *le; + + le = ltree_add_entry(ebh, lnr); + if (!le) + return ENOMEM; + + rw_enter(&le->mutex, RW_WRITER); + return 0; +} + +/** + * leb_write_unlock - unlock a logical eraseblock from write + * @ebh: chfs eraseblock handler + * @lnr: identifier of the logical eraseblock + * + * This function unlocks a logical eraseblock from write and delete it from the + * lock tree is there are no more users of it. 
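+ *
+ * The lock/unlock pairs are used bracket-style; e.g. ebh_write_leb()
+ * below does, in outline:
+ *
+ *	leb_write_lock(ebh, lnr);
+ *	... write the header and data to the mapped PEB ...
+ *	leb_write_unlock(ebh, lnr);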
+ */ +static void +leb_write_unlock(struct chfs_ebh *ebh, int lnr) +{ + struct chfs_ltree_entry *le; + + mutex_enter(&ebh->ltree_lock); + //dbg_ebh("LOCK: ebh->ltree_lock spin locked in leb_write_unlock()\n"); + le = ltree_lookup(ebh, lnr); + if (!le) + goto out; + + le->users -= 1; + KASSERT(le->users >= 0); + rw_exit(&le->mutex); + if (le->users == 0) { + RB_REMOVE(ltree_rbtree, &ebh->ltree, le); + + KASSERT(!rw_lock_held(&le->mutex)); + rw_destroy(&le->mutex); + + kmem_free(le, sizeof(struct chfs_ltree_entry)); + } + +out: + mutex_exit(&ebh->ltree_lock); + //dbg_ebh("UNLOCK: ebh->ltree_lock spin unlocked in leb_write_unlock()\n"); +} + +/*****************************************************************************/ +/* End of Lock Tree */ +/*****************************************************************************/ + +/*****************************************************************************/ +/* Erase related operations */ +/*****************************************************************************/ + +/** + * If the first argument is smaller than the second, the function + * returns a value smaller than zero. If they are equal, the function re- + * turns zero. Otherwise, it should return a value greater than zero. + */ +int +peb_in_use_cmp(struct chfs_peb *peb1, struct chfs_peb *peb2) +{ + return (peb1->pebnr - peb2->pebnr); +} + +int +peb_free_cmp(struct chfs_peb *peb1, struct chfs_peb *peb2) +{ + int comp; + + comp = peb1->erase_cnt - peb2->erase_cnt; + if (0 == comp) + comp = peb1->pebnr - peb2->pebnr; + + return comp; +} + +/* Generate functions for in use PEB's red-black tree */ +RB_PROTOTYPE(peb_in_use_rbtree, chfs_peb, u.rb, peb_in_use_cmp); +RB_GENERATE(peb_in_use_rbtree, chfs_peb, u.rb, peb_in_use_cmp); +RB_PROTOTYPE(peb_free_rbtree, chfs_peb, u.rb, peb_free_cmp); +RB_GENERATE(peb_free_rbtree, chfs_peb, u.rb, peb_free_cmp); + +/** + * add_peb_to_erase_queue: adds a PEB to to_erase/fully_erased queue + * @ebh - chfs eraseblock handler + * @pebnr - physical eraseblock's number + * @ec - erase counter of PEB + * @queue: the queue to add to + * + * This function adds a PEB to the erase queue specified by @queue. + * The @ebh->erase_lock must be locked before using this. + * Returns zero in case of success, error code in case of fail. + */ +int +add_peb_to_erase_queue(struct chfs_ebh *ebh, int pebnr, int ec, + struct peb_queue *queue) +{ + struct chfs_peb *peb; + + peb = kmem_alloc(sizeof(struct chfs_peb), KM_SLEEP); + + peb->erase_cnt = ec; + peb->pebnr = pebnr; + + TAILQ_INSERT_TAIL(queue, peb, u.queue); + + return 0; + +} +//TODO +/** + * find_peb_in_use - looks up a PEB in the RB-tree of used blocks + * @ebh - chfs eraseblock handler + * + * This function returns a pointer to the PEB found in the tree, + * NULL otherwise. + * The @ebh->erase_lock must be locked before using this. + */ +struct chfs_peb * +find_peb_in_use(struct chfs_ebh *ebh, int pebnr) +{ + struct chfs_peb peb, *result; + peb.pebnr = pebnr; + result = RB_FIND(peb_in_use_rbtree, &ebh->in_use, &peb); + return result; +} + +/** + * add_peb_to_free - adds a PEB to the RB-tree of free PEBs + * @ebh - chfs eraseblock handler + * @pebnr - physical eraseblock's number + * @ec - erase counter of PEB + * + * + * This function adds a physical eraseblock to the RB-tree of free PEBs + * stored in the @ebh. The key is the erase counter and pebnr. + * The @ebh->erase_lock must be locked before using this. + * Returns zero in case of success, error code in case of fail. 
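+ * Ordering the tree by erase counter first is what drives wear levelling:
+ * get_peb() always takes RB_MIN of this tree, i.e. the physical block
+ * that has been erased the fewest times.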
+ */ +int +add_peb_to_free(struct chfs_ebh *ebh, int pebnr, int ec) +{ + struct chfs_peb *peb, *result; + + peb = kmem_alloc(sizeof(struct chfs_peb), KM_SLEEP); + + peb->erase_cnt = ec; + peb->pebnr = pebnr; + result = RB_INSERT(peb_free_rbtree, &ebh->free, peb); + if (result) + return 1; + + return 0; +} + +/** + * add_peb_to_in_use - adds a PEB to the RB-tree of used PEBs + * @ebh - chfs eraseblock handler + * @pebnr - physical eraseblock's number + * @ec - erase counter of PEB + * + * + * This function adds a physical eraseblock to the RB-tree of used PEBs + * stored in the @ebh. The key is pebnr. + * The @ebh->erase_lock must be locked before using this. + * Returns zero in case of success, error code in case of fail. + */ +int +add_peb_to_in_use(struct chfs_ebh *ebh, int pebnr, int ec) +{ + struct chfs_peb *peb, *result; + + peb = kmem_alloc(sizeof(struct chfs_peb), KM_SLEEP); + + peb->erase_cnt = ec; + peb->pebnr = pebnr; + result = RB_INSERT(peb_in_use_rbtree, &ebh->in_use, peb); + if (result) + return 1; + + return 0; +} + +/** + * erase_callback - callback function for flash erase + * @ei: erase information + */ +void +erase_callback(struct flash_erase_instruction *ei) +{ + int err; + struct chfs_erase_info_priv *priv = (void *) ei->ei_priv; + //dbg_ebh("ERASE_CALLBACK() CALLED\n"); + struct chfs_ebh *ebh = priv->ebh; + struct chfs_peb *peb = priv->peb; + + peb->erase_cnt += 1; + + if (ei->ei_state == FLASH_ERASE_DONE) { + + /* Write out erase counter */ + err = ebh->ops->mark_eb_hdr_free(ebh, + peb->pebnr, peb->erase_cnt); + if (err) { + /* cannot mark PEB as free,so erase it again */ + chfs_err( + "cannot mark eraseblock as free, PEB: %d\n", + peb->pebnr); + mutex_enter(&ebh->erase_lock); + /*dbg_ebh("LOCK: ebh->erase_lock spin locked in erase_callback() " + "after mark ebhdr free\n");*/ + add_peb_to_erase_queue(ebh, peb->pebnr, peb->erase_cnt, + &ebh->to_erase); + mutex_exit(&ebh->erase_lock); + /*dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in erase_callback() " + "after mark ebhdr free\n");*/ + kmem_free(peb, sizeof(struct chfs_peb)); + return; + } + + mutex_enter(&ebh->erase_lock); + /*dbg_ebh("LOCK: ebh->erase_lock spin locked in erase_callback()\n");*/ + err = add_peb_to_free(ebh, peb->pebnr, peb->erase_cnt); + mutex_exit(&ebh->erase_lock); + /*dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in erase_callback()\n");*/ + kmem_free(peb, sizeof(struct chfs_peb)); + } else { + /* + * Erase is finished, but there was a problem, + * so erase PEB again + */ + chfs_err("erase failed, state is: 0x%x\n", ei->ei_state); + add_peb_to_erase_queue(ebh, peb->pebnr, peb->erase_cnt, &ebh->to_erase); + kmem_free(peb, sizeof(struct chfs_peb)); + } +} + +/** + * free_peb: free a PEB + * @ebh: chfs eraseblock handler + * + * This function erases the first physical eraseblock from one of the erase + * lists and adds to the RB-tree of free PEBs. + * Returns zero in case of succes, error code in case of fail. 
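+ * Blocks taken from the fully_erased queue only need a fresh EC header
+ * written; blocks taken from the to_erase queue are handed to
+ * flash_erase(), and erase_callback() writes the new header and puts
+ * them back on the free tree once the erase has finished.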
+ */ +int +free_peb(struct chfs_ebh *ebh) +{ + int err, retries = 0; + off_t ofs; + struct chfs_peb *peb = NULL; + struct flash_erase_instruction *ei; + + KASSERT(mutex_owned(&ebh->erase_lock)); + + if (!TAILQ_EMPTY(&ebh->fully_erased)) { + //dbg_ebh("[FREE PEB] got a fully erased block\n"); + peb = TAILQ_FIRST(&ebh->fully_erased); + TAILQ_REMOVE(&ebh->fully_erased, peb, u.queue); + err = ebh->ops->mark_eb_hdr_free(ebh, + peb->pebnr, peb->erase_cnt); + if (err) { + goto out_free; + } + err = add_peb_to_free(ebh, peb->pebnr, peb->erase_cnt); + goto out_free; + } + /* Erase PEB */ + //dbg_ebh("[FREE PEB] eraseing a block\n"); + peb = TAILQ_FIRST(&ebh->to_erase); + TAILQ_REMOVE(&ebh->to_erase, peb, u.queue); + mutex_exit(&ebh->erase_lock); + //dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in free_peb()\n"); + ofs = peb->pebnr * ebh->flash_if->erasesize; + + /* XXX where do we free this? */ + ei = kmem_alloc(sizeof(struct flash_erase_instruction) + + sizeof(struct chfs_erase_info_priv), KM_SLEEP); +retry: + memset(ei, 0, sizeof(*ei)); + +// ei->ei_if = ebh->flash_if; + ei->ei_addr = ofs; + ei->ei_len = ebh->flash_if->erasesize; + ei->ei_callback = erase_callback; + ei->ei_priv = (unsigned long) (&ei[1]); + + ((struct chfs_erase_info_priv *) ei->ei_priv)->ebh = ebh; + ((struct chfs_erase_info_priv *) ei->ei_priv)->peb = peb; + + err = flash_erase(ebh->flash_dev, ei); + dbg_ebh("erased peb: %d\n", peb->pebnr); + + /* einval would mean we did something wrong */ + KASSERT(err != EINVAL); + + if (err) { + dbg_ebh("errno: %d, ei->ei_state: %d\n", err, ei->ei_state); + if (CHFS_MAX_GET_PEB_RETRIES < ++retries && + ei->ei_state == FLASH_ERASE_FAILED) { + /* The block went bad mark it */ + dbg_ebh("ebh markbad! 0x%jx\n", (uintmax_t )ofs); + err = flash_block_markbad(ebh->flash_dev, ofs); + if (!err) { + ebh->peb_nr--; + } + + goto out; + } + chfs_err("can not erase PEB: %d, try again\n", peb->pebnr); + goto retry; + } + +out: + /* lock the erase_lock, because it was locked + * when the function was called */ + mutex_enter(&ebh->erase_lock); + return err; + +out_free: + kmem_free(peb, sizeof(struct chfs_peb)); + return err; +} + +/** + * release_peb - schedule an erase for the PEB + * @ebh: chfs eraseblock handler + * @pebnr: physical eraseblock number + * + * This function get the peb identified by @pebnr from the in_use RB-tree of + * @ebh, removes it and schedule an erase for it. + * + * Returns zero on success, error code in case of fail. 
+ */ +int +release_peb(struct chfs_ebh *ebh, int pebnr) +{ + int err = 0; + struct chfs_peb *peb; + + mutex_enter(&ebh->erase_lock); + + //dbg_ebh("LOCK: ebh->erase_lock spin locked in release_peb()\n"); + peb = find_peb_in_use(ebh, pebnr); + if (!peb) { + chfs_err("LEB is mapped, but is not in the 'in_use' " + "tree of ebh\n"); + goto out_unlock; + } + err = add_peb_to_erase_queue(ebh, peb->pebnr, peb->erase_cnt, + &ebh->to_erase); + + if (err) + goto out_unlock; + + RB_REMOVE(peb_in_use_rbtree, &ebh->in_use, peb); +out_unlock: + mutex_exit(&ebh->erase_lock); + //dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in release_peb()" + // " at out_unlock\n"); + return err; +} + +/** + * erase_thread - background thread for erasing PEBs + * @data: pointer to the eraseblock handler + */ +/*void + erase_thread(void *data) + { + struct chfs_ebh *ebh = data; + + dbg_ebh("erase thread started\n"); + while (ebh->bg_erase.eth_running) { + int err; + + mutex_enter(&ebh->erase_lock); + dbg_ebh("LOCK: ebh->erase_lock spin locked in erase_thread()\n"); + if (TAILQ_EMPTY(&ebh->to_erase) && TAILQ_EMPTY(&ebh->fully_erased)) { + dbg_ebh("thread has nothing to do\n"); + mutex_exit(&ebh->erase_lock); + mutex_enter(&ebh->bg_erase.eth_thread_mtx); + cv_timedwait_sig(&ebh->bg_erase.eth_wakeup, + &ebh->bg_erase.eth_thread_mtx, mstohz(100)); + mutex_exit(&ebh->bg_erase.eth_thread_mtx); + + dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in erase_thread()\n"); + continue; + } + mutex_exit(&ebh->erase_lock); + dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in erase_thread()\n"); + + err = free_peb(ebh); + if (err) + chfs_err("freeing PEB failed in the background thread: %d\n", err); + + } + dbg_ebh("erase thread stopped\n"); + kthread_exit(0); + }*/ + +/** + * erase_thread - background thread for erasing PEBs + * @data: pointer to the eraseblock handler + */ +void +erase_thread(void *data) { + dbg_ebh("[EBH THREAD] erase thread started\n"); + + struct chfs_ebh *ebh = data; + int err; + + mutex_enter(&ebh->erase_lock); + while (ebh->bg_erase.eth_running) { + if (TAILQ_EMPTY(&ebh->to_erase) && + TAILQ_EMPTY(&ebh->fully_erased)) { + cv_timedwait_sig(&ebh->bg_erase.eth_wakeup, + &ebh->erase_lock, mstohz(100)); + } else { + /* XXX exiting this mutex is a bit odd here as + * free_peb instantly reenters it... 
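+			 * (free_peb() must be entered with erase_lock held
+			 * and returns with it held, but it drops and retakes
+			 * the lock internally around the flash_erase() call.)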
+ */ + err = free_peb(ebh); + mutex_exit(&ebh->erase_lock); + if (err) { + chfs_err("freeing PEB failed in the" + " background thread: %d\n", err); + } + mutex_enter(&ebh->erase_lock); + } + } + mutex_exit(&ebh->erase_lock); + + dbg_ebh("[EBH THREAD] erase thread stopped\n"); + kthread_exit(0); +} + +/** + * erase_thread_start - init and start erase thread + * @ebh: eraseblock handler + */ +static void +erase_thread_start(struct chfs_ebh *ebh) +{ + cv_init(&ebh->bg_erase.eth_wakeup, "ebheracv"); + + ebh->bg_erase.eth_running = true; + kthread_create(PRI_NONE, KTHREAD_MPSAFE | KTHREAD_MUSTJOIN, NULL, + erase_thread, ebh, &ebh->bg_erase.eth_thread, "ebherase"); +} + +/** + * erase_thread_stop - stop background erase thread + * @ebh: eraseblock handler + */ +static void +erase_thread_stop(struct chfs_ebh *ebh) +{ + ebh->bg_erase.eth_running = false; + cv_signal(&ebh->bg_erase.eth_wakeup); + dbg_ebh("[EBH THREAD STOP] signaled\n"); + + kthread_join(ebh->bg_erase.eth_thread); +#ifdef BROKEN_KTH_JOIN + kpause("chfsebhjointh", false, mstohz(1000), NULL); +#endif + + cv_destroy(&ebh->bg_erase.eth_wakeup); +} + +/*****************************************************************************/ +/* End of Erase related operations */ +/*****************************************************************************/ + +/*****************************************************************************/ +/* Scan related operations */ +/*****************************************************************************/ +int +scan_leb_used_cmp(struct chfs_scan_leb *sleb1, struct chfs_scan_leb *sleb2) +{ + return (sleb1->lnr - sleb2->lnr); +} + +RB_PROTOTYPE(scan_leb_used_rbtree, chfs_scan_leb, u.rb, scan_leb_used_cmp); +RB_GENERATE(scan_leb_used_rbtree, chfs_scan_leb, u.rb, scan_leb_used_cmp); + +/** + * scan_add_to_queue - adds a physical eraseblock to one of the + * eraseblock queue + * @si: chfs scanning information + * @pebnr: physical eraseblock number + * @erase_cnt: erase counter of the physical eraseblock + * @list: the list to add to + * + * This function adds a physical eraseblock to one of the lists in the scanning + * information. + * Returns zero in case of success, negative error code in case of fail. + */ +static int +scan_add_to_queue(struct chfs_scan_info *si, int pebnr, int erase_cnt, + struct scan_leb_queue *queue) +{ + struct chfs_scan_leb *sleb; + + sleb = kmem_alloc(sizeof(struct chfs_scan_leb), KM_SLEEP); + + sleb->pebnr = pebnr; + sleb->erase_cnt = erase_cnt; + TAILQ_INSERT_TAIL(queue, sleb, u.queue); + return 0; +} + +/* + * nor_scan_add_to_used - add a physical eraseblock to the + * used tree of scan info + * @ebh: chfs eraseblock handler + * @si: chfs scanning information + * @ebhdr: eraseblock header + * @pebnr: physical eraseblock number + * @leb_status: the status of the PEB's eraseblock header + * + * This function adds a PEB to the used tree of the scanning information. + * It handles the situations if there are more physical eraseblock referencing + * to the same logical eraseblock. + * Returns zero in case of success, error code in case of fail. 
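+ * On NOR the conflict is resolved with the dirty bit: a copy whose header
+ * is marked dirty was superseded by a newer write of the same LEB, so it
+ * is the one queued for erasing.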
+ */ +int +nor_scan_add_to_used(struct chfs_ebh *ebh, struct chfs_scan_info *si, + struct chfs_eb_hdr *ebhdr, int pebnr, int leb_status) +{ + int err, lnr, ec; + struct chfs_scan_leb *sleb, *old; + + lnr = CHFS_GET_LID(ebhdr->u.nor_hdr.lid); + ec = le32toh(ebhdr->ec_hdr.erase_cnt); + + sleb = kmem_alloc(sizeof(struct chfs_scan_leb), KM_SLEEP); + + sleb->erase_cnt = ec; + sleb->lnr = lnr; + sleb->pebnr = pebnr; + sleb->info = leb_status; + + old = RB_INSERT(scan_leb_used_rbtree, &si->used, sleb); + if (old) { + kmem_free(sleb, sizeof(struct chfs_scan_leb)); + /* There is already an eraseblock in the used tree */ + /* If the new one is bad */ + if (EBHDR_LEB_DIRTY == leb_status && + EBHDR_LEB_OK == old->info) { + return scan_add_to_queue(si, pebnr, ec, &si->erase); + } else { + err = scan_add_to_queue(si, old->pebnr, + old->erase_cnt, &si->erase); + if (err) { + return err; + } + + old->erase_cnt = ec; + old->lnr = lnr; + old->pebnr = pebnr; + old->info = leb_status; + return 0; + } + } + return 0; +} + +/** + * nor_process eb -read the headers from NOR flash, check them and add to + * the scanning information + * @ebh: chfs eraseblock handler + * @si: chfs scanning information + * @pebnr: physical eraseblock number + * + * Returns zero in case of success, error code in case of fail. + */ +int +nor_process_eb(struct chfs_ebh *ebh, struct chfs_scan_info *si, + int pebnr, struct chfs_eb_hdr *ebhdr) +{ + int err, erase_cnt, leb_status; + + err = ebh->ops->read_eb_hdr(ebh, pebnr, ebhdr); + if (err) + return err; + + erase_cnt = le32toh(ebhdr->ec_hdr.erase_cnt); + dbg_ebh("erase_cnt: %d\n", erase_cnt); + leb_status = ebh->ops->check_eb_hdr(ebh, ebhdr); + if (EBHDR_LEB_BADMAGIC == leb_status || + EBHDR_LEB_BADCRC == leb_status) { + err = scan_add_to_queue(si, pebnr, erase_cnt, &si->corrupted); + return err; + } + else if (EBHDR_LEB_FREE == leb_status) { + err = scan_add_to_queue(si, pebnr, erase_cnt, &si->free); + goto count_mean; + } + else if (EBHDR_LEB_NO_HDR == leb_status) { + err = scan_add_to_queue(si, pebnr, erase_cnt, &si->erased); + return err; + } + else if (EBHDR_LEB_INVALIDATED == leb_status) { + err = scan_add_to_queue(si, pebnr, erase_cnt, &si->erase); + return err; + } + + err = nor_scan_add_to_used(ebh, si, ebhdr, pebnr, leb_status); + if (err) + return err; + + +count_mean: + si->sum_of_ec += erase_cnt; + si->num_of_eb++; + + return err; +} + +/* + * nand_scan_add_to_used - add a physical eraseblock to the + * used tree of scan info + * @ebh: chfs eraseblock handler + * @si: chfs scanning information + * @ebhdr: eraseblock header + * @pebnr: physical eraseblock number + * @leb_status: the status of the PEB's eraseblock header + * + * This function adds a PEB to the used tree of the scanning information. + * It handles the situations if there are more physical eraseblock referencing + * to the same logical eraseblock. + * Returns zero in case of success, error code in case of fail. 
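+ * On NAND the header cannot be rewritten in place, so the conflict is
+ * resolved with the 64-bit serial number instead: the copy with the
+ * larger serial is the newer one and is kept, the other is queued for
+ * erasing.  (The lid is read through u.nor_hdr below; this presumably
+ * relies on the lid field having the same offset in both union members.)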
+ */ +int +nand_scan_add_to_used(struct chfs_ebh *ebh, struct chfs_scan_info *si, + struct chfs_eb_hdr *ebhdr, int pebnr) +{ + int err, lnr, ec; + struct chfs_scan_leb *sleb, *old; + uint64_t serial = le64toh(ebhdr->u.nand_hdr.serial); + + lnr = CHFS_GET_LID(ebhdr->u.nor_hdr.lid); + ec = le32toh(ebhdr->ec_hdr.erase_cnt); + + sleb = kmem_alloc(sizeof(struct chfs_scan_leb), KM_SLEEP); + + sleb->erase_cnt = ec; + sleb->lnr = lnr; + sleb->pebnr = pebnr; + sleb->info = serial; + + old = RB_INSERT(scan_leb_used_rbtree, &si->used, sleb); + if (old) { + kmem_free(sleb, sizeof(struct chfs_scan_leb)); + /* There is already an eraseblock in the used tree */ + /* If the new one is bad */ + if (serial < old->info) + return scan_add_to_queue(si, pebnr, ec, &si->erase); + else { + err = scan_add_to_queue(si, + old->pebnr, old->erase_cnt, &si->erase); + if (err) + return err; + + old->erase_cnt = ec; + old->lnr = lnr; + old->pebnr = pebnr; + old->info = serial; + return 0; + } + } + return 0; +} + +/** + * nand_process eb -read the headers from NAND flash, check them and add to the + * scanning information + * @ebh: chfs eraseblock handler + * @si: chfs scanning information + * @pebnr: physical eraseblock number + * + * Returns zero in case of success, error code in case of fail. + */ +int +nand_process_eb(struct chfs_ebh *ebh, struct chfs_scan_info *si, + int pebnr, struct chfs_eb_hdr *ebhdr) +{ + int err, erase_cnt, leb_status; + uint64_t max_serial; + /* isbad() is defined on some ancient platforms, heh */ + bool is_bad; + + /* Check block is bad */ + err = flash_block_isbad(ebh->flash_dev, + pebnr * ebh->flash_if->erasesize, &is_bad); + if (err) { + chfs_err("checking block is bad failed\n"); + return err; + } + if (is_bad) { + si->bad_peb_cnt++; + return 0; + } + + err = ebh->ops->read_eb_hdr(ebh, pebnr, ebhdr); + if (err) + return err; + + erase_cnt = le32toh(ebhdr->ec_hdr.erase_cnt); + leb_status = ebh->ops->check_eb_hdr(ebh, ebhdr); + if (EBHDR_LEB_BADMAGIC == leb_status || + EBHDR_LEB_BADCRC == leb_status) { + err = scan_add_to_queue(si, pebnr, erase_cnt, &si->corrupted); + return err; + } + else if (EBHDR_LEB_FREE == leb_status) { + err = scan_add_to_queue(si, pebnr, erase_cnt, &si->free); + goto count_mean; + } + else if (EBHDR_LEB_NO_HDR == leb_status) { + err = scan_add_to_queue(si, pebnr, erase_cnt, &si->erased); + return err; + } + + err = nand_scan_add_to_used(ebh, si, ebhdr, pebnr); + if (err) + return err; + + max_serial = le64toh(ebhdr->u.nand_hdr.serial); + if (max_serial > *ebh->max_serial) { + *ebh->max_serial = max_serial; + } + +count_mean: + si->sum_of_ec += erase_cnt; + si->num_of_eb++; + + return err; +} + +/** + * chfs_scan - scans the media and returns informations about it + * @ebh: chfs eraseblock handler + * + * This function scans through the media and returns information about it or if + * it fails NULL will be returned. 
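+ * The result sorts the PEBs into the 'used' tree (valid LEB header) and
+ * the free, erased, erase and corrupted queues; scan_media() rebuilds the
+ * in-memory eraseblock state from these buckets afterwards.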
+ */ +struct chfs_scan_info * +chfs_scan(struct chfs_ebh *ebh) +{ + struct chfs_scan_info *si; + struct chfs_eb_hdr *ebhdr; + int pebnr, err; + + si = kmem_alloc(sizeof(*si), KM_SLEEP); + + TAILQ_INIT(&si->corrupted); + TAILQ_INIT(&si->free); + TAILQ_INIT(&si->erase); + TAILQ_INIT(&si->erased); + RB_INIT(&si->used); + si->bad_peb_cnt = 0; + si->num_of_eb = 0; + si->sum_of_ec = 0; + + ebhdr = kmem_alloc(sizeof(*ebhdr), KM_SLEEP); + + for (pebnr = 0; pebnr < ebh->peb_nr; pebnr++) { + dbg_ebh("processing PEB %d\n", pebnr); + err = ebh->ops->process_eb(ebh, si, pebnr, ebhdr); + if (err < 0) + goto out_ebhdr; + } + kmem_free(ebhdr, sizeof(*ebhdr)); + dbg_ebh("[CHFS_SCAN] scanning information collected\n"); + return si; + +out_ebhdr: + kmem_free(ebhdr, sizeof(*ebhdr)); + kmem_free(si, sizeof(*si)); + return NULL; +} + +/** + * scan_info_destroy - frees all lists and trees in the scanning information + * @si: the scanning information + */ +void +scan_info_destroy(struct chfs_scan_info *si) +{ + EBH_QUEUE_DESTROY(&si->corrupted, + struct chfs_scan_leb, u.queue); + + EBH_QUEUE_DESTROY(&si->erase, + struct chfs_scan_leb, u.queue); + + EBH_QUEUE_DESTROY(&si->erased, + struct chfs_scan_leb, u.queue); + + EBH_QUEUE_DESTROY(&si->free, + struct chfs_scan_leb, u.queue); + + EBH_TREE_DESTROY(scan_leb_used_rbtree, + &si->used, struct chfs_scan_leb); + + kmem_free(si, sizeof(*si)); + dbg_ebh("[SCAN_INFO_DESTROY] scanning information destroyed\n"); +} + +/** + * scan_media - scan media + * + * @ebh - chfs eraseblock handler + * + * Returns zero in case of success, error code in case of fail. + */ + +int +scan_media(struct chfs_ebh *ebh) +{ + int err, i, avg_ec; + struct chfs_scan_info *si; + struct chfs_scan_leb *sleb; + + si = chfs_scan(ebh); + /* + * Process the scan info, manage the eraseblock lists + */ + mutex_init(&ebh->ltree_lock, MUTEX_DEFAULT, IPL_NONE); + mutex_init(&ebh->erase_lock, MUTEX_DEFAULT, IPL_NONE); + RB_INIT(&ebh->ltree); + RB_INIT(&ebh->free); + RB_INIT(&ebh->in_use); + TAILQ_INIT(&ebh->to_erase); + TAILQ_INIT(&ebh->fully_erased); + mutex_init(&ebh->alc_mutex, MUTEX_DEFAULT, IPL_NONE); + + ebh->peb_nr -= si->bad_peb_cnt; + + /* + * Create background thread for erasing + */ + erase_thread_start(ebh); + + ebh->lmap = kmem_alloc(ebh->peb_nr * sizeof(int), KM_SLEEP); + + for (i = 0; i < ebh->peb_nr; i++) { + ebh->lmap[i] = EBH_LEB_UNMAPPED; + } + + if (si->num_of_eb == 0) { + /* The flash contains no data. 
*/ + avg_ec = 0; + } + else { + avg_ec = (int) (si->sum_of_ec / si->num_of_eb); + } + dbg_ebh("num_of_eb: %d\n", si->num_of_eb); + + mutex_enter(&ebh->erase_lock); + + RB_FOREACH(sleb, scan_leb_used_rbtree, &si->used) { + ebh->lmap[sleb->lnr] = sleb->pebnr; + err = add_peb_to_in_use(ebh, sleb->pebnr, sleb->erase_cnt); + if (err) + goto out_free; + } + + TAILQ_FOREACH(sleb, &si->erased, u.queue) { + err = add_peb_to_erase_queue(ebh, sleb->pebnr, avg_ec, + &ebh->fully_erased); + if (err) + goto out_free; + } + + TAILQ_FOREACH(sleb, &si->erase, u.queue) { + err = add_peb_to_erase_queue(ebh, sleb->pebnr, avg_ec, + &ebh->to_erase); + if (err) + goto out_free; + } + + TAILQ_FOREACH(sleb, &si->free, u.queue) { + err = add_peb_to_free(ebh, sleb->pebnr, sleb->erase_cnt); + if (err) + goto out_free; + } + + TAILQ_FOREACH(sleb, &si->corrupted, u.queue) { + err = add_peb_to_erase_queue(ebh, sleb->pebnr, avg_ec, + &ebh->to_erase); + if (err) + goto out_free; + } + mutex_exit(&ebh->erase_lock); + scan_info_destroy(si); + return 0; + +out_free: + mutex_exit(&ebh->erase_lock); + kmem_free(ebh->lmap, ebh->peb_nr * sizeof(int)); + scan_info_destroy(si); + dbg_ebh("[SCAN_MEDIA] returning with error: %d\n", err); + return err; +} + +/*****************************************************************************/ +/* End of Scan related operations */ +/*****************************************************************************/ + +/** + * ebh_open - opens mtd device and init ereaseblock header + * @ebh: eraseblock handler + * @flash_nr: flash device number to use + * + * Returns zero in case of success, error code in case of fail. + */ +int +ebh_open(struct chfs_ebh *ebh, dev_t dev) +{ + int err; + + ebh->flash_dev = flash_get_device(dev); + if (!ebh->flash_dev) { + aprint_error("ebh_open: cant get flash device\n"); + return ENODEV; + } + + ebh->flash_if = flash_get_interface(dev); + if (!ebh->flash_if) { + aprint_error("ebh_open: cant get flash interface\n"); + return ENODEV; + } + + ebh->flash_size = flash_get_size(dev); + ebh->peb_nr = ebh->flash_size / ebh->flash_if->erasesize; +// ebh->peb_nr = ebh->flash_if->size / ebh->flash_if->erasesize; + /* Set up flash operations based on flash type */ + ebh->ops = kmem_alloc(sizeof(struct chfs_ebh_ops), KM_SLEEP); + + switch (ebh->flash_if->type) { + case FLASH_TYPE_NOR: + ebh->eb_size = ebh->flash_if->erasesize - + CHFS_EB_EC_HDR_SIZE - CHFS_EB_HDR_NOR_SIZE; + + ebh->ops->read_eb_hdr = nor_read_eb_hdr; + ebh->ops->write_eb_hdr = nor_write_eb_hdr; + ebh->ops->check_eb_hdr = nor_check_eb_hdr; + ebh->ops->mark_eb_hdr_dirty_flash = + nor_mark_eb_hdr_dirty_flash; + ebh->ops->invalidate_eb_hdr = nor_invalidate_eb_hdr; + ebh->ops->mark_eb_hdr_free = mark_eb_hdr_free; + + ebh->ops->process_eb = nor_process_eb; + + ebh->ops->create_eb_hdr = nor_create_eb_hdr; + ebh->ops->calc_data_offs = nor_calc_data_offs; + + ebh->max_serial = NULL; + break; + case FLASH_TYPE_NAND: + ebh->eb_size = ebh->flash_if->erasesize - + 2 * ebh->flash_if->page_size; + + ebh->ops->read_eb_hdr = nand_read_eb_hdr; + ebh->ops->write_eb_hdr = nand_write_eb_hdr; + ebh->ops->check_eb_hdr = nand_check_eb_hdr; + ebh->ops->mark_eb_hdr_free = mark_eb_hdr_free; + ebh->ops->mark_eb_hdr_dirty_flash = NULL; + ebh->ops->invalidate_eb_hdr = NULL; + + ebh->ops->process_eb = nand_process_eb; + + ebh->ops->create_eb_hdr = nand_create_eb_hdr; + ebh->ops->calc_data_offs = nand_calc_data_offs; + + ebh->max_serial = kmem_alloc(sizeof(uint64_t), KM_SLEEP); + + *ebh->max_serial = 0; + break; + default: + return 1; + } + 
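+	/*
+	 * eb_size is the space usable for data in each logical eraseblock:
+	 * the physical erase size minus the header overhead (EC header plus
+	 * NOR header on NOR flash, the first two flash pages on NAND).
+	 */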
printf("opening ebh: eb_size: %zu\n", ebh->eb_size); + err = scan_media(ebh); + if (err) { + dbg_ebh("Scan failed."); + kmem_free(ebh->ops, sizeof(struct chfs_ebh_ops)); + kmem_free(ebh, sizeof(struct chfs_ebh)); + return err; + } + return 0; +} + +/** + * ebh_close - close ebh + * @ebh: eraseblock handler + * Returns zero in case of success, error code in case of fail. + */ +int +ebh_close(struct chfs_ebh *ebh) +{ + erase_thread_stop(ebh); + + EBH_TREE_DESTROY(peb_free_rbtree, &ebh->free, struct chfs_peb); + EBH_TREE_DESTROY(peb_in_use_rbtree, &ebh->in_use, struct chfs_peb); + + EBH_QUEUE_DESTROY(&ebh->fully_erased, struct chfs_peb, u.queue); + EBH_QUEUE_DESTROY(&ebh->to_erase, struct chfs_peb, u.queue); + + /* XXX HACK, see ebh.h */ + EBH_TREE_DESTROY_MUTEX(ltree_rbtree, &ebh->ltree, + struct chfs_ltree_entry); + + KASSERT(!mutex_owned(&ebh->ltree_lock)); + KASSERT(!mutex_owned(&ebh->alc_mutex)); + KASSERT(!mutex_owned(&ebh->erase_lock)); + + mutex_destroy(&ebh->ltree_lock); + mutex_destroy(&ebh->alc_mutex); + mutex_destroy(&ebh->erase_lock); + + kmem_free(ebh->ops, sizeof(struct chfs_ebh_ops)); + kmem_free(ebh, sizeof(struct chfs_ebh)); + + return 0; +} + +/** + * ebh_read_leb - read data from leb + * @ebh: eraseblock handler + * @lnr: logical eraseblock number + * @buf: buffer to read to + * @offset: offset from where to read + * @len: bytes number to read + * + * Returns zero in case of success, error code in case of fail. + */ +int +ebh_read_leb(struct chfs_ebh *ebh, int lnr, char *buf, uint32_t offset, + size_t len, size_t *retlen) +{ + int err, pebnr; + off_t data_offset; + + KASSERT(offset + len <= ebh->eb_size); + + err = leb_read_lock(ebh, lnr); + if (err) + return err; + pebnr = ebh->lmap[lnr]; + /* If PEB is not mapped the buffer is filled with 0xFF */ + if (EBH_LEB_UNMAPPED == pebnr) { + leb_read_unlock(ebh, lnr); + memset(buf, 0xFF, len); + return 0; + } + + /* Read data */ + data_offset = ebh->ops->calc_data_offs(ebh, pebnr, offset); + err = flash_read(ebh->flash_dev, data_offset, len, retlen, + (unsigned char *) buf); + if (err) + goto out_free; + + KASSERT(len == *retlen); + + leb_read_unlock(ebh, lnr); + return err; + +out_free: + leb_read_unlock(ebh, lnr); + return err; +} + +/** + * get_peb: get a free physical eraseblock + * @ebh - chfs eraseblock handler + * + * This function gets a free eraseblock from the ebh->free RB-tree. + * The fist entry will be returned and deleted from the tree. + * The entries sorted by the erase counters, so the PEB with the smallest + * erase counter will be added back. + * If something goes bad a negative value will be returned. 
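+ * (Note that the failure paths below in fact return positive values --
+ * ENOSPC or the error from free_peb()/add_peb_to_in_use() -- while
+ * callers such as ebh_write_leb() only treat a negative result as an
+ * error.)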
+ */ +int +get_peb(struct chfs_ebh *ebh) +{ + int err, pebnr; + struct chfs_peb *peb; + +retry: + mutex_enter(&ebh->erase_lock); + //dbg_ebh("LOCK: ebh->erase_lock spin locked in get_peb()\n"); + if (RB_EMPTY(&ebh->free)) { + /*There is no more free PEBs in the tree*/ + if (TAILQ_EMPTY(&ebh->to_erase) && + TAILQ_EMPTY(&ebh->fully_erased)) { + mutex_exit(&ebh->erase_lock); + //dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in get_peb()\n"); + return ENOSPC; + } + err = free_peb(ebh); + + mutex_exit(&ebh->erase_lock); + //dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in get_peb()\n"); + + if (err) + return err; + goto retry; + } + peb = RB_MIN(peb_free_rbtree, &ebh->free); + pebnr = peb->pebnr; + RB_REMOVE(peb_free_rbtree, &ebh->free, peb); + err = add_peb_to_in_use(ebh, peb->pebnr, peb->erase_cnt); + if (err) + pebnr = err; + + kmem_free(peb, sizeof(struct chfs_peb)); + + mutex_exit(&ebh->erase_lock); + //dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in get_peb()\n"); + + return pebnr; +} + +/** + * ebh_write_leb - write data to leb + * @ebh: eraseblock handler + * @lnr: logical eraseblock number + * @buf: data to write + * @offset: offset where to write + * @len: bytes number to write + * + * Returns zero in case of success, error code in case of fail. + */ +int +ebh_write_leb(struct chfs_ebh *ebh, int lnr, char *buf, uint32_t offset, + size_t len, size_t *retlen) +{ + int err, pebnr, retries = 0; + off_t data_offset; + struct chfs_eb_hdr *ebhdr; + + dbg("offset: %d | len: %zu | (offset+len): %zu " + " | ebsize: %zu\n", offset, len, (offset+len), ebh->eb_size); + + KASSERT(offset + len <= ebh->eb_size); + + err = leb_write_lock(ebh, lnr); + if (err) + return err; + + pebnr = ebh->lmap[lnr]; + /* If the LEB is mapped write out data */ + if (pebnr != EBH_LEB_UNMAPPED) { + data_offset = ebh->ops->calc_data_offs(ebh, pebnr, offset); + err = flash_write(ebh->flash_dev, data_offset, len, retlen, + (unsigned char *) buf); + + if (err) { + chfs_err("error %d while writing %zu bytes to PEB " + "%d:%ju, written %zu bytes\n", + err, len, pebnr, (uintmax_t )offset, *retlen); + } else { + KASSERT(len == *retlen); + } + + leb_write_unlock(ebh, lnr); + return err; + } + + /* + * If the LEB is unmapped, get a free PEB and write the + * eraseblock header first + */ + ebhdr = kmem_alloc(sizeof(struct chfs_eb_hdr), KM_SLEEP); + + /* Setting up eraseblock header properties */ + ebh->ops->create_eb_hdr(ebhdr, lnr); + +retry: + /* Getting a physical eraseblock from the wear leveling system */ + pebnr = get_peb(ebh); + if (pebnr < 0) { + leb_write_unlock(ebh, lnr); + kmem_free(ebhdr, sizeof(struct chfs_eb_hdr)); + return pebnr; + } + + /* Write the eraseblock header to the media */ + err = ebh->ops->write_eb_hdr(ebh, pebnr, ebhdr); + if (err) { + chfs_warn( + "error writing eraseblock header: LEB %d , PEB %d\n", + lnr, pebnr); + goto write_error; + } + + /* Write out data */ + if (len) { + data_offset = ebh->ops->calc_data_offs(ebh, pebnr, offset); + err = flash_write(ebh->flash_dev, + data_offset, len, retlen, (unsigned char *) buf); + if (err) { + chfs_err("error %d while writing %zu bytes to PEB " + " %d:%ju, written %zu bytes\n", + err, len, pebnr, (uintmax_t )offset, *retlen); + goto write_error; + } + } + + ebh->lmap[lnr] = pebnr; + leb_write_unlock(ebh, lnr); + kmem_free(ebhdr, sizeof(struct chfs_eb_hdr)); + + return 0; + +write_error: err = release_peb(ebh, pebnr); + // max retries (NOW: 2) + if (err || CHFS_MAX_GET_PEB_RETRIES < ++retries) { + leb_write_unlock(ebh, lnr); + kmem_free(ebhdr, 
sizeof(struct chfs_eb_hdr)); + return err; + } + goto retry; +} + +/** + * ebh_erase_leb - erase a leb + * @ebh: eraseblock handler + * @lnr: leb number + * + * Returns zero in case of success, error code in case of fail. + */ +int +ebh_erase_leb(struct chfs_ebh *ebh, int lnr) +{ + int err, pebnr; + + leb_write_lock(ebh, lnr); + + pebnr = ebh->lmap[lnr]; + if (pebnr < 0) { + leb_write_unlock(ebh, lnr); + return EBH_LEB_UNMAPPED; + } + err = release_peb(ebh, pebnr); + if (err) + goto out_unlock; + + ebh->lmap[lnr] = EBH_LEB_UNMAPPED; + cv_signal(&ebh->bg_erase.eth_wakeup); +out_unlock: + leb_write_unlock(ebh, lnr); + return err; +} + +/** + * ebh_map_leb - maps a PEB to LEB + * @ebh: eraseblock handler + * @lnr: leb number + * + * Returns zero on success, error code in case of fail + */ +int +ebh_map_leb(struct chfs_ebh *ebh, int lnr) +{ + int err, pebnr, retries = 0; + struct chfs_eb_hdr *ebhdr; + + ebhdr = kmem_alloc(sizeof(struct chfs_eb_hdr), KM_SLEEP); + + err = leb_write_lock(ebh, lnr); + if (err) + return err; + +retry: + pebnr = get_peb(ebh); + if (pebnr < 0) { + err = pebnr; + goto out_unlock; + } + + ebh->ops->create_eb_hdr(ebhdr, lnr); + + err = ebh->ops->write_eb_hdr(ebh, pebnr, ebhdr); + if (err) { + chfs_warn( + "error writing eraseblock header: LEB %d , PEB %d\n", + lnr, pebnr); + goto write_error; + } + + ebh->lmap[lnr] = pebnr; + +out_unlock: + leb_write_unlock(ebh, lnr); + return err; + +write_error: + err = release_peb(ebh, pebnr); + // max retries (NOW: 2) + if (err || CHFS_MAX_GET_PEB_RETRIES < ++retries) { + leb_write_unlock(ebh, lnr); + kmem_free(ebhdr, sizeof(struct chfs_eb_hdr)); + return err; + } + goto retry; +} + +/** + * ebh_unmap_leb - + * @ebh: eraseblock handler + * @lnr: leb number + * + * Retruns zero on success, error code in case of fail. + */ +int +ebh_unmap_leb(struct chfs_ebh *ebh, int lnr) +{ + int err; + + if (ebh_is_mapped(ebh, lnr) < 0) + /* If the eraseblock already unmapped */ + return 0; + + err = ebh_erase_leb(ebh, lnr); + + return err; +} + +/** + * ebh_is_mapped - check if a PEB is mapped to @lnr + * @ebh: eraseblock handler + * @lnr: leb number + * + * Retruns 0 if the logical eraseblock is mapped, negative error code otherwise. + */ +int +ebh_is_mapped(struct chfs_ebh *ebh, int lnr) +{ + int err, result; + err = leb_read_lock(ebh, lnr); + if (err) + return err; + + result = ebh->lmap[lnr]; + leb_read_unlock(ebh, lnr); + + return result; +} + +/** + * ebh_change_leb - write the LEB to another PEB + * @ebh: eraseblock handler + * @lnr: leb number + * @buf: data to write + * @len: length of data + * Returns zero in case of success, error code in case of fail. 
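+ *
+ * The change is copy-on-write: the old header is first marked dirty (on
+ * NOR), the new header and data are written to a freshly allocated PEB,
+ * and only then is the old header invalidated and the old PEB released
+ * for erasing.  A crash in between therefore leaves at most two copies of
+ * the LEB on the media, which the scan code arbitrates between.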
+ */ +int +ebh_change_leb(struct chfs_ebh *ebh, int lnr, char *buf, size_t len, + size_t *retlen) +{ + int err, pebnr, pebnr_old, retries = 0; + off_t data_offset; + + struct chfs_peb *peb = NULL; + struct chfs_eb_hdr *ebhdr; + + if (ebh_is_mapped(ebh, lnr) < 0) + return EBH_LEB_UNMAPPED; + + if (len == 0) { + err = ebh_unmap_leb(ebh, lnr); + if (err) + return err; + return ebh_map_leb(ebh, lnr); + } + + ebhdr = kmem_alloc(sizeof(struct chfs_eb_hdr), KM_SLEEP); + + pebnr_old = ebh->lmap[lnr]; + + mutex_enter(&ebh->alc_mutex); + err = leb_write_lock(ebh, lnr); + if (err) + goto out_mutex; + + if (ebh->ops->mark_eb_hdr_dirty_flash) { + err = ebh->ops->mark_eb_hdr_dirty_flash(ebh, pebnr_old, lnr); + if (err) + goto out_unlock; + } + + /* Setting up eraseblock header properties */ + ebh->ops->create_eb_hdr(ebhdr, lnr); + +retry: + /* Getting a physical eraseblock from the wear leveling system */ + pebnr = get_peb(ebh); + if (pebnr < 0) { + leb_write_unlock(ebh, lnr); + mutex_exit(&ebh->alc_mutex); + kmem_free(ebhdr, sizeof(struct chfs_eb_hdr)); + return pebnr; + } + + err = ebh->ops->write_eb_hdr(ebh, pebnr, ebhdr); + if (err) { + chfs_warn( + "error writing eraseblock header: LEB %d , PEB %d", + lnr, pebnr); + goto write_error; + } + + /* Write out data */ + data_offset = ebh->ops->calc_data_offs(ebh, pebnr, 0); + err = flash_write(ebh->flash_dev, data_offset, len, retlen, + (unsigned char *) buf); + if (err) { + chfs_err("error %d while writing %zu bytes to PEB %d:%ju," + " written %zu bytes", + err, len, pebnr, (uintmax_t)data_offset, *retlen); + goto write_error; + } + + ebh->lmap[lnr] = pebnr; + + if (ebh->ops->invalidate_eb_hdr) { + err = ebh->ops->invalidate_eb_hdr(ebh, pebnr_old); + if (err) + goto out_unlock; + } + peb = find_peb_in_use(ebh, pebnr_old); + err = release_peb(ebh, peb->pebnr); + +out_unlock: + leb_write_unlock(ebh, lnr); + +out_mutex: + mutex_exit(&ebh->alc_mutex); + kmem_free(ebhdr, sizeof(struct chfs_eb_hdr)); + kmem_free(peb, sizeof(struct chfs_peb)); + return err; + +write_error: + err = release_peb(ebh, pebnr); + //max retries (NOW: 2) + if (err || CHFS_MAX_GET_PEB_RETRIES < ++retries) { + leb_write_unlock(ebh, lnr); + mutex_exit(&ebh->alc_mutex); + kmem_free(ebhdr, sizeof(struct chfs_eb_hdr)); + return err; + } + goto retry; +} + diff --git a/include/ufs/chfs/ebh.h b/sys/ufs/chfs/ebh.h similarity index 100% rename from include/ufs/chfs/ebh.h rename to sys/ufs/chfs/ebh.h diff --git a/include/ufs/chfs/ebh_media.h b/sys/ufs/chfs/ebh_media.h similarity index 100% rename from include/ufs/chfs/ebh_media.h rename to sys/ufs/chfs/ebh_media.h diff --git a/include/ufs/chfs/ebh_misc.h b/sys/ufs/chfs/ebh_misc.h similarity index 100% rename from include/ufs/chfs/ebh_misc.h rename to sys/ufs/chfs/ebh_misc.h diff --git a/include/ufs/chfs/media.h b/sys/ufs/chfs/media.h similarity index 100% rename from include/ufs/chfs/media.h rename to sys/ufs/chfs/media.h diff --git a/sys/ufs/ext2fs/Makefile b/sys/ufs/ext2fs/Makefile new file mode 100644 index 000000000..a3df42f59 --- /dev/null +++ b/sys/ufs/ext2fs/Makefile @@ -0,0 +1,7 @@ +# $NetBSD: Makefile,v 1.1 1998/06/12 23:23:11 cgd Exp $ + +INCSDIR= /usr/include/ufs/ext2fs + +INCS= ext2fs.h ext2fs_dinode.h ext2fs_dir.h ext2fs_extern.h + +.include diff --git a/include/ufs/ext2fs/ext2fs.h b/sys/ufs/ext2fs/ext2fs.h similarity index 100% rename from include/ufs/ext2fs/ext2fs.h rename to sys/ufs/ext2fs/ext2fs.h diff --git a/sys/ufs/ext2fs/ext2fs_alloc.c b/sys/ufs/ext2fs/ext2fs_alloc.c new file mode 100644 index 000000000..9c2b4cf40 --- 
/dev/null +++ b/sys/ufs/ext2fs/ext2fs_alloc.c @@ -0,0 +1,637 @@ +/* $NetBSD: ext2fs_alloc.c,v 1.42 2011/03/06 04:46:26 rmind Exp $ */ + +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_alloc.c 8.11 (Berkeley) 10/27/94 + * Modified for ext2fs by Manuel Bouyer. + */ + +/* + * Copyright (c) 1997 Manuel Bouyer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @(#)ffs_alloc.c 8.11 (Berkeley) 10/27/94 + * Modified for ext2fs by Manuel Bouyer. 
+ */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ext2fs_alloc.c,v 1.42 2011/03/06 04:46:26 rmind Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +u_long ext2gennumber; + +static daddr_t ext2fs_alloccg(struct inode *, int, daddr_t, int); +static u_long ext2fs_dirpref(struct m_ext2fs *); +static void ext2fs_fserr(struct m_ext2fs *, u_int, const char *); +static u_long ext2fs_hashalloc(struct inode *, int, long, int, + daddr_t (*)(struct inode *, int, daddr_t, int)); +static daddr_t ext2fs_nodealloccg(struct inode *, int, daddr_t, int); +static daddr_t ext2fs_mapsearch(struct m_ext2fs *, char *, daddr_t); + +/* + * Allocate a block in the file system. + * + * A preference may be optionally specified. If a preference is given + * the following hierarchy is used to allocate a block: + * 1) allocate the requested block. + * 2) allocate a rotationally optimal block in the same cylinder. + * 3) allocate a block in the same cylinder group. + * 4) quadradically rehash into other cylinder groups, until an + * available block is located. + * If no block preference is given the following hierarchy is used + * to allocate a block: + * 1) allocate a block in the cylinder group that contains the + * inode for the file. + * 2) quadradically rehash into other cylinder groups, until an + * available block is located. + */ +int +ext2fs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, + kauth_cred_t cred, daddr_t *bnp) +{ + struct m_ext2fs *fs; + daddr_t bno; + int cg; + + *bnp = 0; + fs = ip->i_e2fs; +#ifdef DIAGNOSTIC + if (cred == NOCRED) + panic("ext2fs_alloc: missing credential"); +#endif /* DIAGNOSTIC */ + if (fs->e2fs.e2fs_fbcount == 0) + goto nospace; + if (kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL, + NULL, NULL) != 0 && + freespace(fs) <= 0) + goto nospace; + if (bpref >= fs->e2fs.e2fs_bcount) + bpref = 0; + if (bpref == 0) + cg = ino_to_cg(fs, ip->i_number); + else + cg = dtog(fs, bpref); + bno = (daddr_t)ext2fs_hashalloc(ip, cg, bpref, fs->e2fs_bsize, + ext2fs_alloccg); + if (bno > 0) { + ip->i_e2fs_nblock += btodb(fs->e2fs_bsize); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + *bnp = bno; + return (0); + } +nospace: + ext2fs_fserr(fs, kauth_cred_geteuid(cred), "file system full"); + uprintf("\n%s: write failed, file system is full\n", fs->e2fs_fsmnt); + return (ENOSPC); +} + +/* + * Allocate an inode in the file system. + * + * If allocating a directory, use ext2fs_dirpref to select the inode. + * If allocating in a directory, the following hierarchy is followed: + * 1) allocate the preferred inode. + * 2) allocate an inode in the same cylinder group. + * 3) quadradically rehash into other cylinder groups, until an + * available inode is located. + * If no inode preference is given the following hierarchy is used + * to allocate an inode: + * 1) allocate an inode in cylinder group 0. + * 2) quadradically rehash into other cylinder groups, until an + * available inode is located. 
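Both allocators here start from a cylinder group chosen the same way: with no preference, the group holding the file's (or parent's) inode; with a block preference, the group holding that block. A small sketch of that starting-point choice using made-up geometry, where IPG and BPG stand in for the superblock's e2fs_ipg/e2fs_bpg values:

#include <stdio.h>

/* Hypothetical geometry: 1024 inodes and 8192 blocks per cylinder group. */
#define IPG           1024
#define BPG           8192
#define FIRST_DBLOCK  1

/* ino_to_cg(): which group holds this inode (inode numbers are 1-based). */
static int ino_to_cg(unsigned ino) { return (ino - 1) / IPG; }

/* dtog(): which group holds this filesystem block. */
static int dtog(unsigned blk) { return (blk - FIRST_DBLOCK) / BPG; }

/* Starting group for a block allocation, as chosen in ext2fs_alloc(). */
static int start_cg(unsigned ino, unsigned bpref)
{
    return bpref == 0 ? ino_to_cg(ino) : dtog(bpref);
}

int main(void)
{
    printf("no preference, inode 3000 -> cg %d\n", start_cg(3000, 0));
    printf("preferred block 20000     -> cg %d\n", start_cg(3000, 20000));
    return 0;
}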
+ */ +int +ext2fs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred, + struct vnode **vpp) +{ + struct inode *pip; + struct m_ext2fs *fs; + struct inode *ip; + ino_t ino, ipref; + int cg, error; + + *vpp = NULL; + pip = VTOI(pvp); + fs = pip->i_e2fs; + if (fs->e2fs.e2fs_ficount == 0) + goto noinodes; + + if ((mode & IFMT) == IFDIR) + cg = ext2fs_dirpref(fs); + else + cg = ino_to_cg(fs, pip->i_number); + ipref = cg * fs->e2fs.e2fs_ipg + 1; + ino = (ino_t)ext2fs_hashalloc(pip, cg, (long)ipref, mode, ext2fs_nodealloccg); + if (ino == 0) + goto noinodes; + error = VFS_VGET(pvp->v_mount, ino, vpp); + if (error) { + ext2fs_vfree(pvp, ino, mode); + return (error); + } + ip = VTOI(*vpp); + if (ip->i_e2fs_mode && ip->i_e2fs_nlink != 0) { + printf("mode = 0%o, nlinks %d, inum = %llu, fs = %s\n", + ip->i_e2fs_mode, ip->i_e2fs_nlink, + (unsigned long long)ip->i_number, fs->e2fs_fsmnt); + panic("ext2fs_valloc: dup alloc"); + } + + memset(ip->i_din.e2fs_din, 0, sizeof(struct ext2fs_dinode)); + + /* + * Set up a new generation number for this inode. + */ + if (++ext2gennumber < time_second) + ext2gennumber = time_second; + ip->i_e2fs_gen = ext2gennumber; + return (0); +noinodes: + ext2fs_fserr(fs, kauth_cred_geteuid(cred), "out of inodes"); + uprintf("\n%s: create/symlink failed, no inodes free\n", fs->e2fs_fsmnt); + return (ENOSPC); +} + +/* + * Find a cylinder to place a directory. + * + * The policy implemented by this algorithm is to select from + * among those cylinder groups with above the average number of + * free inodes, the one with the smallest number of directories. + */ +static u_long +ext2fs_dirpref(struct m_ext2fs *fs) +{ + int cg, maxspace, mincg, avgifree; + + avgifree = fs->e2fs.e2fs_ficount / fs->e2fs_ncg; + maxspace = 0; + mincg = -1; + for (cg = 0; cg < fs->e2fs_ncg; cg++) + if ( fs->e2fs_gd[cg].ext2bgd_nifree >= avgifree) { + if (mincg == -1 || fs->e2fs_gd[cg].ext2bgd_nbfree > maxspace) { + mincg = cg; + maxspace = fs->e2fs_gd[cg].ext2bgd_nbfree; + } + } + return mincg; +} + +/* + * Select the desired position for the next block in a file. The file is + * logically divided into sections. The first section is composed of the + * direct blocks. Each additional section contains fs_maxbpg blocks. + * + * If no blocks have been allocated in the first section, the policy is to + * request a block in the same cylinder group as the inode that describes + * the file. Otherwise, the policy is to try to allocate the blocks + * contigously. The two fields of the ext2 inode extension (see + * ufs/ufs/inode.h) help this. + */ +daddr_t +ext2fs_blkpref(struct inode *ip, daddr_t lbn, int indx, + int32_t *bap /* XXX ondisk32 */) +{ + struct m_ext2fs *fs; + int cg, i; + + fs = ip->i_e2fs; + /* + * if we are doing contigous lbn allocation, try to alloc blocks + * contigously on disk + */ + + if ( ip->i_e2fs_last_blk && lbn == ip->i_e2fs_last_lblk + 1) { + return ip->i_e2fs_last_blk + 1; + } + + /* + * bap, if provided, gives us a list of blocks to which we want to + * stay close + */ + + if (bap) { + for (i = indx; i >= 0 ; i--) { + if (bap[i]) { + return fs2h32(bap[i]) + 1; + } + } + } + + /* fall back to the first block of the cylinder containing the inode */ + + cg = ino_to_cg(fs, ip->i_number); + return fs->e2fs.e2fs_bpg * cg + fs->e2fs.e2fs_first_dblock + 1; +} + +/* + * Implement the cylinder overflow algorithm. + * + * The policy implemented by this algorithm is: + * 1) allocate the block in its requested cylinder group. + * 2) quadradically rehash on the cylinder group number. 
+ * 3) brute force search for a free block. + */ +static u_long +ext2fs_hashalloc(struct inode *ip, int cg, long pref, int size, + daddr_t (*allocator)(struct inode *, int, daddr_t, int)) +{ + struct m_ext2fs *fs; + long result; + int i, icg = cg; + + fs = ip->i_e2fs; + /* + * 1: preferred cylinder group + */ + result = (*allocator)(ip, cg, pref, size); + if (result) + return (result); + /* + * 2: quadratic rehash + */ + for (i = 1; i < fs->e2fs_ncg; i *= 2) { + cg += i; + if (cg >= fs->e2fs_ncg) + cg -= fs->e2fs_ncg; + result = (*allocator)(ip, cg, 0, size); + if (result) + return (result); + } + /* + * 3: brute force search + * Note that we start at i == 2, since 0 was checked initially, + * and 1 is always checked in the quadratic rehash. + */ + cg = (icg + 2) % fs->e2fs_ncg; + for (i = 2; i < fs->e2fs_ncg; i++) { + result = (*allocator)(ip, cg, 0, size); + if (result) + return (result); + cg++; + if (cg == fs->e2fs_ncg) + cg = 0; + } + return (0); +} + +/* + * Determine whether a block can be allocated. + * + * Check to see if a block of the appropriate size is available, + * and if it is, allocate it. + */ + +static daddr_t +ext2fs_alloccg(struct inode *ip, int cg, daddr_t bpref, int size) +{ + struct m_ext2fs *fs; + char *bbp; + struct buf *bp; + /* XXX ondisk32 */ + int error, bno, start, end, loc; + + fs = ip->i_e2fs; + if (fs->e2fs_gd[cg].ext2bgd_nbfree == 0) + return (0); + error = bread(ip->i_devvp, fsbtodb(fs, + fs->e2fs_gd[cg].ext2bgd_b_bitmap), + (int)fs->e2fs_bsize, NOCRED, B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + return (0); + } + bbp = (char *)bp->b_data; + + if (dtog(fs, bpref) != cg) + bpref = 0; + if (bpref != 0) { + bpref = dtogd(fs, bpref); + /* + * if the requested block is available, use it + */ + if (isclr(bbp, bpref)) { + bno = bpref; + goto gotit; + } + } + /* + * no blocks in the requested cylinder, so take next + * available one in this cylinder group. + * first try to get 8 contigous blocks, then fall back to a single + * block. + */ + if (bpref) + start = dtogd(fs, bpref) / NBBY; + else + start = 0; + end = howmany(fs->e2fs.e2fs_fpg, NBBY) - start; + for (loc = start; loc < end; loc++) { + if (bbp[loc] == 0) { + bno = loc * NBBY; + goto gotit; + } + } + for (loc = 0; loc < start; loc++) { + if (bbp[loc] == 0) { + bno = loc * NBBY; + goto gotit; + } + } + + bno = ext2fs_mapsearch(fs, bbp, bpref); + if (bno < 0) + return (0); +gotit: +#ifdef DIAGNOSTIC + if (isset(bbp, (daddr_t)bno)) { + printf("ext2fs_alloccgblk: cg=%d bno=%d fs=%s\n", + cg, bno, fs->e2fs_fsmnt); + panic("ext2fs_alloccg: dup alloc"); + } +#endif + setbit(bbp, (daddr_t)bno); + fs->e2fs.e2fs_fbcount--; + fs->e2fs_gd[cg].ext2bgd_nbfree--; + fs->e2fs_fmod = 1; + bdwrite(bp); + return (cg * fs->e2fs.e2fs_fpg + fs->e2fs.e2fs_first_dblock + bno); +} + +/* + * Determine whether an inode can be allocated. + * + * Check to see if an inode is available, and if it is, + * allocate it using the following policy: + * 1) allocate the requested inode. + * 2) allocate the next available inode after the requested + * inode in the specified cylinder group. 
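ext2fs_hashalloc() above is the cylinder-overflow search: try the preferred group, quadratically rehash (cg+1, cg+2, cg+4, ...), then sweep the remaining groups starting at icg+2. A self-contained model of just that search order, where try_alloc() is a toy predicate standing in for the per-group allocator callback:

#include <stdio.h>

#define NCG 8

/* 1 = this group can satisfy the request, 0 = it cannot (toy allocator). */
static int try_alloc(int cg) { return cg == 6; }

/* Cylinder-overflow search: preferred group, quadratic rehash, brute force. */
static int hashalloc(int cg)
{
    int i, icg = cg;

    if (try_alloc(cg))                       /* 1: preferred group */
        return cg;
    for (i = 1; i < NCG; i *= 2) {           /* 2: quadratic rehash */
        cg += i;
        if (cg >= NCG)
            cg -= NCG;
        if (try_alloc(cg))
            return cg;
    }
    cg = (icg + 2) % NCG;                    /* 3: brute-force sweep */
    for (i = 2; i < NCG; i++) {
        if (try_alloc(cg))
            return cg;
        if (++cg == NCG)
            cg = 0;
    }
    return -1;                               /* nothing free anywhere */
}

int main(void)
{
    printf("allocation landed in cg %d\n", hashalloc(1));
    return 0;
}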
+ */ +static daddr_t +ext2fs_nodealloccg(struct inode *ip, int cg, daddr_t ipref, int mode) +{ + struct m_ext2fs *fs; + char *ibp; + struct buf *bp; + int error, start, len, loc, map, i; + + ipref--; /* to avoid a lot of (ipref -1) */ + if (ipref == -1) + ipref = 0; + fs = ip->i_e2fs; + if (fs->e2fs_gd[cg].ext2bgd_nifree == 0) + return (0); + error = bread(ip->i_devvp, fsbtodb(fs, + fs->e2fs_gd[cg].ext2bgd_i_bitmap), + (int)fs->e2fs_bsize, NOCRED, B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + return (0); + } + ibp = (char *)bp->b_data; + if (ipref) { + ipref %= fs->e2fs.e2fs_ipg; + if (isclr(ibp, ipref)) + goto gotit; + } + start = ipref / NBBY; + len = howmany(fs->e2fs.e2fs_ipg - ipref, NBBY); + loc = skpc(0xff, len, &ibp[start]); + if (loc == 0) { + len = start + 1; + start = 0; + loc = skpc(0xff, len, &ibp[0]); + if (loc == 0) { + printf("cg = %d, ipref = %lld, fs = %s\n", + cg, (long long)ipref, fs->e2fs_fsmnt); + panic("ext2fs_nodealloccg: map corrupted"); + /* NOTREACHED */ + } + } + i = start + len - loc; + map = ibp[i] ^ 0xff; + if (map == 0) { + printf("fs = %s\n", fs->e2fs_fsmnt); + panic("ext2fs_nodealloccg: block not in map"); + } + ipref = i * NBBY + ffs(map) - 1; +gotit: + setbit(ibp, ipref); + fs->e2fs.e2fs_ficount--; + fs->e2fs_gd[cg].ext2bgd_nifree--; + fs->e2fs_fmod = 1; + if ((mode & IFMT) == IFDIR) { + fs->e2fs_gd[cg].ext2bgd_ndirs++; + } + bdwrite(bp); + return (cg * fs->e2fs.e2fs_ipg + ipref +1); +} + +/* + * Free a block. + * + * The specified block is placed back in the + * free map. + */ +void +ext2fs_blkfree(struct inode *ip, daddr_t bno) +{ + struct m_ext2fs *fs; + char *bbp; + struct buf *bp; + int error, cg; + + fs = ip->i_e2fs; + cg = dtog(fs, bno); + if ((u_int)bno >= fs->e2fs.e2fs_bcount) { + printf("bad block %lld, ino %llu\n", (long long)bno, + (unsigned long long)ip->i_number); + ext2fs_fserr(fs, ip->i_uid, "bad block"); + return; + } + error = bread(ip->i_devvp, + fsbtodb(fs, fs->e2fs_gd[cg].ext2bgd_b_bitmap), + (int)fs->e2fs_bsize, NOCRED, B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + return; + } + bbp = (char *)bp->b_data; + bno = dtogd(fs, bno); + if (isclr(bbp, bno)) { + printf("dev = 0x%llx, block = %lld, fs = %s\n", + (unsigned long long)ip->i_dev, (long long)bno, + fs->e2fs_fsmnt); + panic("blkfree: freeing free block"); + } + clrbit(bbp, bno); + fs->e2fs.e2fs_fbcount++; + fs->e2fs_gd[cg].ext2bgd_nbfree++; + + fs->e2fs_fmod = 1; + bdwrite(bp); +} + +/* + * Free an inode. + * + * The specified inode is placed back in the free map. 
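Allocation and freeing within a group both come down to bitmap operations: skip fully used bytes, pick the first clear bit of the first partial byte with ffs(), set it to allocate, clear it to free. A stand-alone sketch of that bitmap handling; setbit/clrbit/isset are re-defined locally (in the kernel they come from <sys/param.h>) and the byte skip is written out by hand where the code above uses skpc():

#include <stdio.h>
#include <strings.h>     /* ffs() */

#define NBBY 8

/* Classic bitmap macros, as used by the allocators above. */
#define setbit(a, i)  ((a)[(i) / NBBY] |=  (1 << ((i) % NBBY)))
#define clrbit(a, i)  ((a)[(i) / NBBY] &= ~(1 << ((i) % NBBY)))
#define isset(a, i)   ((a)[(i) / NBBY] &   (1 << ((i) % NBBY)))

/* First clear bit: skip 0xff bytes, then ffs() on the inverted byte. */
static int first_clear(unsigned char *map, int nbytes)
{
    for (int i = 0; i < nbytes; i++) {
        int inv = map[i] ^ 0xff;
        if (inv != 0)
            return i * NBBY + ffs(inv) - 1;
    }
    return -1;                               /* map is full */
}

int main(void)
{
    unsigned char map[4] = { 0xff, 0xff, 0x3f, 0x00 };  /* bits 0..21 in use */
    int bno = first_clear(map, sizeof(map));

    printf("allocated bit %d\n", bno);       /* prints 22 */
    setbit(map, bno);
    printf("now in use? %s\n", isset(map, bno) ? "yes" : "no");
    clrbit(map, bno);                        /* the free path just clears it */
    printf("after free? %s\n", isset(map, bno) ? "yes" : "no");
    return 0;
}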
+ */ +int +ext2fs_vfree(struct vnode *pvp, ino_t ino, int mode) +{ + struct m_ext2fs *fs; + char *ibp; + struct inode *pip; + struct buf *bp; + int error, cg; + + pip = VTOI(pvp); + fs = pip->i_e2fs; + if ((u_int)ino > fs->e2fs.e2fs_icount || (u_int)ino < EXT2_FIRSTINO) + panic("ifree: range: dev = 0x%llx, ino = %llu, fs = %s", + (unsigned long long)pip->i_dev, (unsigned long long)ino, + fs->e2fs_fsmnt); + cg = ino_to_cg(fs, ino); + error = bread(pip->i_devvp, + fsbtodb(fs, fs->e2fs_gd[cg].ext2bgd_i_bitmap), + (int)fs->e2fs_bsize, NOCRED, B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + return (0); + } + ibp = (char *)bp->b_data; + ino = (ino - 1) % fs->e2fs.e2fs_ipg; + if (isclr(ibp, ino)) { + printf("dev = 0x%llx, ino = %llu, fs = %s\n", + (unsigned long long)pip->i_dev, + (unsigned long long)ino, fs->e2fs_fsmnt); + if (fs->e2fs_ronly == 0) + panic("ifree: freeing free inode"); + } + clrbit(ibp, ino); + fs->e2fs.e2fs_ficount++; + fs->e2fs_gd[cg].ext2bgd_nifree++; + if ((mode & IFMT) == IFDIR) { + fs->e2fs_gd[cg].ext2bgd_ndirs--; + } + fs->e2fs_fmod = 1; + bdwrite(bp); + return (0); +} + +/* + * Find a block in the specified cylinder group. + * + * It is a panic if a request is made to find a block if none are + * available. + */ + +static daddr_t +ext2fs_mapsearch(struct m_ext2fs *fs, char *bbp, daddr_t bpref) +{ + int start, len, loc, i, map; + + /* + * find the fragment by searching through the free block + * map for an appropriate bit pattern + */ + if (bpref) + start = dtogd(fs, bpref) / NBBY; + else + start = 0; + len = howmany(fs->e2fs.e2fs_fpg, NBBY) - start; + loc = skpc(0xff, len, &bbp[start]); + if (loc == 0) { + len = start + 1; + start = 0; + loc = skpc(0xff, len, &bbp[start]); + if (loc == 0) { + printf("start = %d, len = %d, fs = %s\n", + start, len, fs->e2fs_fsmnt); + panic("ext2fs_alloccg: map corrupted"); + /* NOTREACHED */ + } + } + i = start + len - loc; + map = bbp[i] ^ 0xff; + if (map == 0) { + printf("fs = %s\n", fs->e2fs_fsmnt); + panic("ext2fs_mapsearch: block not in map"); + } + return i * NBBY + ffs(map) - 1; +} + +/* + * Fserr prints the name of a file system with an error diagnostic. + * + * The form of the error message is: + * fs: error message + */ +static void +ext2fs_fserr(struct m_ext2fs *fs, u_int uid, const char *cp) +{ + + log(LOG_ERR, "uid %d on %s: %s\n", uid, fs->e2fs_fsmnt, cp); +} diff --git a/sys/ufs/ext2fs/ext2fs_balloc.c b/sys/ufs/ext2fs/ext2fs_balloc.c new file mode 100644 index 000000000..6564bf905 --- /dev/null +++ b/sys/ufs/ext2fs/ext2fs_balloc.c @@ -0,0 +1,403 @@ +/* $NetBSD: ext2fs_balloc.c,v 1.34 2009/10/19 18:41:17 bouyer Exp $ */ + +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_balloc.c 8.4 (Berkeley) 9/23/93 + * Modified for ext2fs by Manuel Bouyer. + */ + +/* + * Copyright (c) 1997 Manuel Bouyer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @(#)ffs_balloc.c 8.4 (Berkeley) 9/23/93 + * Modified for ext2fs by Manuel Bouyer. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ext2fs_balloc.c,v 1.34 2009/10/19 18:41:17 bouyer Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_uvmhist.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include + +/* + * Balloc defines the structure of file system storage + * by allocating the physical blocks on a device given + * the inode and the logical block number in a file. 
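ext2fs_balloc() below first handles the NDADDR direct blocks and then walks one, two, or three levels of indirect blocks, as worked out by ufs_getlbns(). A sketch of that classification with made-up geometry (12 direct pointers, 1024 pointers per indirect block), standing in for the real NDADDR/NINDIR(fs):

#include <stdio.h>

#define NDADDR 12
#define NINDIR 1024

/* How many levels of indirection does logical block bn need? */
static int levels(long long bn)
{
    long long blks = NDADDR, span = 1;

    if (bn < NDADDR)
        return 0;                          /* direct block */
    for (int lvl = 1; lvl <= 3; lvl++) {
        span *= NINDIR;                    /* blocks reachable at this level */
        if (bn < blks + span)
            return lvl;
        blks += span;
    }
    return -1;                             /* beyond triple indirect */
}

int main(void)
{
    long long probes[] = { 0, 11, 12, 1035, 1036, 1049611, 1049612 };

    for (unsigned i = 0; i < sizeof(probes) / sizeof(probes[0]); i++)
        printf("lbn %lld -> %d level(s) of indirection\n",
            probes[i], levels(probes[i]));
    return 0;
}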
+ */ +int +ext2fs_balloc(struct inode *ip, daddr_t bn, int size, + kauth_cred_t cred, struct buf **bpp, int flags) +{ + struct m_ext2fs *fs; + daddr_t nb; + struct buf *bp, *nbp; + struct vnode *vp = ITOV(ip); + struct indir indirs[NIADDR + 2]; + daddr_t newb, lbn, pref; + int32_t *bap; /* XXX ondisk32 */ + int num, i, error; + u_int deallocated; + daddr_t *blkp, *allocblk, allociblk[NIADDR + 1]; + int32_t *allocib; /* XXX ondisk32 */ + int unwindidx = -1; + UVMHIST_FUNC("ext2fs_balloc"); UVMHIST_CALLED(ubchist); + + UVMHIST_LOG(ubchist, "bn 0x%x", bn,0,0,0); + + if (bpp != NULL) { + *bpp = NULL; + } + if (bn < 0) + return (EFBIG); + fs = ip->i_e2fs; + lbn = bn; + + /* + * The first NDADDR blocks are direct blocks + */ + if (bn < NDADDR) { + /* XXX ondisk32 */ + nb = fs2h32(ip->i_e2fs_blocks[bn]); + if (nb != 0) { + + /* + * the block is already allocated, just read it. + */ + + if (bpp != NULL) { + error = bread(vp, bn, fs->e2fs_bsize, NOCRED, + B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + return (error); + } + *bpp = bp; + } + return (0); + } + + /* + * allocate a new direct block. + */ + + error = ext2fs_alloc(ip, bn, + ext2fs_blkpref(ip, bn, bn, &ip->i_e2fs_blocks[0]), + cred, &newb); + if (error) + return (error); + ip->i_e2fs_last_lblk = lbn; + ip->i_e2fs_last_blk = newb; + /* XXX ondisk32 */ + ip->i_e2fs_blocks[bn] = h2fs32((int32_t)newb); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (bpp != NULL) { + bp = getblk(vp, bn, fs->e2fs_bsize, 0, 0); + bp->b_blkno = fsbtodb(fs, newb); + if (flags & B_CLRBUF) + clrbuf(bp); + *bpp = bp; + } + return (0); + } + /* + * Determine the number of levels of indirection. + */ + pref = 0; + if ((error = ufs_getlbns(vp, bn, indirs, &num)) != 0) + return(error); +#ifdef DIAGNOSTIC + if (num < 1) + panic ("ext2fs_balloc: ufs_getlbns returned indirect block\n"); +#endif + /* + * Fetch the first indirect block allocating if necessary. + */ + --num; + /* XXX ondisk32 */ + nb = fs2h32(ip->i_e2fs_blocks[NDADDR + indirs[0].in_off]); + allocib = NULL; + allocblk = allociblk; + if (nb == 0) { + pref = ext2fs_blkpref(ip, lbn, 0, (int32_t *)0); + error = ext2fs_alloc(ip, lbn, pref, cred, &newb); + if (error) + return (error); + nb = newb; + *allocblk++ = nb; + ip->i_e2fs_last_blk = newb; + bp = getblk(vp, indirs[1].in_lbn, fs->e2fs_bsize, 0, 0); + bp->b_blkno = fsbtodb(fs, newb); + clrbuf(bp); + /* + * Write synchronously so that indirect blocks + * never point at garbage. + */ + if ((error = bwrite(bp)) != 0) + goto fail; + unwindidx = 0; + allocib = &ip->i_e2fs_blocks[NDADDR + indirs[0].in_off]; + /* XXX ondisk32 */ + *allocib = h2fs32((int32_t)newb); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } + /* + * Fetch through the indirect blocks, allocating as necessary. + */ + for (i = 1;;) { + error = bread(vp, + indirs[i].in_lbn, (int)fs->e2fs_bsize, NOCRED, 0, &bp); + if (error) { + brelse(bp, 0); + goto fail; + } + bap = (int32_t *)bp->b_data; /* XXX ondisk32 */ + nb = fs2h32(bap[indirs[i].in_off]); + if (i == num) + break; + i++; + if (nb != 0) { + brelse(bp, 0); + continue; + } + pref = ext2fs_blkpref(ip, lbn, 0, (int32_t *)0); + error = ext2fs_alloc(ip, lbn, pref, cred, &newb); + if (error) { + brelse(bp, 0); + goto fail; + } + nb = newb; + *allocblk++ = nb; + ip->i_e2fs_last_blk = newb; + nbp = getblk(vp, indirs[i].in_lbn, fs->e2fs_bsize, 0, 0); + nbp->b_blkno = fsbtodb(fs, nb); + clrbuf(nbp); + /* + * Write synchronously so that indirect blocks + * never point at garbage. 
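The ordering rule stated in the comment above is the crash-safety property of this walk: a newly allocated indirect block is written out before any on-disk parent is made to point at it, so a crash can at worst leak a block, never leave a pointer to uninitialized data. A toy illustration of that write ordering; write_block() merely models a synchronous bwrite() and none of these names are kernel API:

#include <stdio.h>

static int on_disk[16];                      /* toy "stable storage" */

static void write_block(int blkno)
{
    on_disk[blkno] = 1;
    printf("wrote block %d\n", blkno);
}

/* Child contents become durable before the parent is allowed to point at them. */
static void link_child(int child, int *parent_slot)
{
    write_block(child);        /* 1: synchronous write of the new indirect block */
    *parent_slot = child;      /* 2: only then does the parent reference it       */
                               /*    (the parent itself may be written lazily)    */
}

int main(void)
{
    int parent_slot = 0;

    link_child(2, &parent_slot);
    printf("parent slot -> %d, child durable: %s\n",
        parent_slot, on_disk[2] ? "yes" : "no");
    return 0;
}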
+ */ + if ((error = bwrite(nbp)) != 0) { + brelse(bp, 0); + goto fail; + } + if (unwindidx < 0) + unwindidx = i - 1; + /* XXX ondisk32 */ + bap[indirs[i - 1].in_off] = h2fs32((int32_t)nb); + /* + * If required, write synchronously, otherwise use + * delayed write. + */ + if (flags & B_SYNC) { + bwrite(bp); + } else { + bdwrite(bp); + } + } + /* + * Get the data block, allocating if necessary. + */ + if (nb == 0) { + pref = ext2fs_blkpref(ip, lbn, indirs[num].in_off, &bap[0]); + error = ext2fs_alloc(ip, lbn, pref, cred, &newb); + if (error) { + brelse(bp, 0); + goto fail; + } + nb = newb; + *allocblk++ = nb; + ip->i_e2fs_last_lblk = lbn; + ip->i_e2fs_last_blk = newb; + /* XXX ondisk32 */ + bap[indirs[num].in_off] = h2fs32((int32_t)nb); + /* + * If required, write synchronously, otherwise use + * delayed write. + */ + if (flags & B_SYNC) { + bwrite(bp); + } else { + bdwrite(bp); + } + if (bpp != NULL) { + nbp = getblk(vp, lbn, fs->e2fs_bsize, 0, 0); + nbp->b_blkno = fsbtodb(fs, nb); + if (flags & B_CLRBUF) + clrbuf(nbp); + *bpp = nbp; + } + return (0); + } + brelse(bp, 0); + if (bpp != NULL) { + if (flags & B_CLRBUF) { + error = bread(vp, lbn, (int)fs->e2fs_bsize, NOCRED, + B_MODIFY, &nbp); + if (error) { + brelse(nbp, 0); + goto fail; + } + } else { + nbp = getblk(vp, lbn, fs->e2fs_bsize, 0, 0); + nbp->b_blkno = fsbtodb(fs, nb); + } + *bpp = nbp; + } + return (0); +fail: + /* + * If we have failed part way through block allocation, we + * have to deallocate any indirect blocks that we have allocated. + */ + for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) { + ext2fs_blkfree(ip, *blkp); + deallocated += fs->e2fs_bsize; + } + if (unwindidx >= 0) { + if (unwindidx == 0) { + *allocib = 0; + } else { + int r; + + r = bread(vp, indirs[unwindidx].in_lbn, + (int)fs->e2fs_bsize, NOCRED, B_MODIFY, &bp); + if (r) { + panic("Could not unwind indirect block, error %d", r); + brelse(bp, 0); + } else { + bap = (int32_t *)bp->b_data; /* XXX ondisk32 */ + bap[indirs[unwindidx].in_off] = 0; + if (flags & B_SYNC) + bwrite(bp); + else + bdwrite(bp); + } + } + for (i = unwindidx + 1; i <= num; i++) { + bp = getblk(vp, indirs[i].in_lbn, (int)fs->e2fs_bsize, + 0, 0); + brelse(bp, BC_INVAL); + } + } + if (deallocated) { + ip->i_e2fs_nblock -= btodb(deallocated); + ip->i_e2fs_flags |= IN_CHANGE | IN_UPDATE; + } + return error; +} + +int +ext2fs_gop_alloc(struct vnode *vp, off_t off, off_t len, int flags, + kauth_cred_t cred) +{ + struct inode *ip = VTOI(vp); + struct m_ext2fs *fs = ip->i_e2fs; + int error, delta, bshift, bsize; + UVMHIST_FUNC("ext2fs_gop_alloc"); UVMHIST_CALLED(ubchist); + + bshift = fs->e2fs_bshift; + bsize = 1 << bshift; + + delta = off & (bsize - 1); + off -= delta; + len += delta; + + while (len > 0) { + bsize = min(bsize, len); + UVMHIST_LOG(ubchist, "off 0x%x len 0x%x bsize 0x%x", + off, len, bsize, 0); + + error = ext2fs_balloc(ip, lblkno(fs, off), bsize, cred, + NULL, flags); + if (error) { + UVMHIST_LOG(ubchist, "error %d", error, 0,0,0); + return error; + } + + /* + * increase file size now, ext2fs_balloc() requires that + * EOF be up-to-date before each call. + */ + + if (ext2fs_size(ip) < off + bsize) { + UVMHIST_LOG(ubchist, "old 0x%lx%8lx new 0x%lx%8lx", + /* Note that arguments are always cast to u_long. 
*/ + ext2fs_size(ip) >> 32, + ext2fs_size(ip) & 0xffffffff, + (off + bsize) >> 32, + (off + bsize) & 0xffffffff); + error = ext2fs_setsize(ip, off + bsize); + if (error) { + UVMHIST_LOG(ubchist, "error %d", error, 0,0,0); + return error; + } + } + + off += bsize; + len -= bsize; + } + return 0; +} diff --git a/sys/ufs/ext2fs/ext2fs_bmap.c b/sys/ufs/ext2fs/ext2fs_bmap.c new file mode 100644 index 000000000..5336fddc4 --- /dev/null +++ b/sys/ufs/ext2fs/ext2fs_bmap.c @@ -0,0 +1,269 @@ +/* $NetBSD: ext2fs_bmap.c,v 1.25 2009/10/19 18:41:17 bouyer Exp $ */ + +/* + * Copyright (c) 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_bmap.c 8.6 (Berkeley) 1/21/94 + * Modified for ext2fs by Manuel Bouyer. + */ + +/* + * Copyright (c) 1997 Manuel Bouyer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @(#)ufs_bmap.c 8.6 (Berkeley) 1/21/94 + * Modified for ext2fs by Manuel Bouyer. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ext2fs_bmap.c,v 1.25 2009/10/19 18:41:17 bouyer Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +static int ext2fs_bmaparray(struct vnode *, daddr_t, daddr_t *, + struct indir *, int *, int *); + +#define is_sequential(ump, a, b) ((b) == (a) + ump->um_seqinc) + +/* + * Bmap converts a the logical block number of a file to its physical block + * number on the disk. The conversion is done by using the logical block + * number to index into the array of block pointers described by the dinode. + */ +int +ext2fs_bmap(void *v) +{ + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct vnode **a_vpp; + daddr_t *a_bnp; + int *a_runp; + } */ *ap = v; + /* + * Check for underlying vnode requests and ensure that logical + * to physical mapping is requested. + */ + if (ap->a_vpp != NULL) + *ap->a_vpp = VTOI(ap->a_vp)->i_devvp; + if (ap->a_bnp == NULL) + return (0); + + return (ext2fs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, NULL, NULL, + ap->a_runp)); +} + +/* + * Indirect blocks are now on the vnode for the file. They are given negative + * logical block numbers. Indirect blocks are addressed by the negative + * address of the first data block to which they point. Double indirect blocks + * are addressed by one less than the address of the first indirect block to + * which they point. Triple indirect blocks are addressed by one less than + * the address of the first double indirect block to which they point. + * + * ext2fs_bmaparray does the bmap conversion, and if requested returns the + * array of logical blocks which must be traversed to get to a block. + * Each entry contains the offset into that block that gets you to the + * next block and the disk address of the block (if it is assigned). + */ + +int +ext2fs_bmaparray(struct vnode *vp, daddr_t bn, daddr_t *bnp, struct indir *ap, + int *nump, int *runp) +{ + struct inode *ip; + struct buf *bp, *cbp; + struct ufsmount *ump; + struct mount *mp; + struct indir a[NIADDR+1], *xap; + daddr_t daddr; + daddr_t metalbn; + int error, maxrun = 0, num; + + ip = VTOI(vp); + mp = vp->v_mount; + ump = ip->i_ump; +#ifdef DIAGNOSTIC + if ((ap != NULL && nump == NULL) || (ap == NULL && nump != NULL)) + panic("ext2fs_bmaparray: invalid arguments"); +#endif + + if (runp) { + /* + * XXX + * If MAXBSIZE is the largest transfer the disks can handle, + * we probably want maxrun to be 1 block less so that we + * don't create a block larger than the device can handle. 
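The run detection referred to above lets the caller cluster I/O: starting from the block just mapped, count how many following block pointers are physically contiguous, capped so the clustered transfer stays under MAXBSIZE (maxrun = MAXBSIZE / iosize - 1). A user-space sketch of that counting; contiguity is checked in simple block units here, whereas is_sequential() in the code compares device-block addresses via um_seqinc:

#include <stdio.h>

#define MAXBSIZE 65536

/* Count contiguous successors of entry bn, capped by maxrun. */
static int count_run(const int *blks, int n, int bn, int iosize)
{
    int maxrun = MAXBSIZE / iosize - 1;
    int run = 0;

    for (int i = bn + 1;
         i < n && run < maxrun && blks[i] == blks[i - 1] + 1; i++)
        run++;
    return run;
}

int main(void)
{
    int blks[] = { 100, 101, 102, 103, 200, 201 };

    printf("run after entry 0: %d extra block(s)\n", count_run(blks, 6, 0, 4096));
    printf("run after entry 4: %d extra block(s)\n", count_run(blks, 6, 4, 4096));
    return 0;
}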
+ */ + *runp = 0; + maxrun = MAXBSIZE / mp->mnt_stat.f_iosize - 1; + } + + if (bn >= 0 && bn < NDADDR) { + /* XXX ondisk32 */ + *bnp = blkptrtodb(ump, fs2h32(ip->i_e2fs_blocks[bn])); + if (*bnp == 0) + *bnp = -1; + else if (runp) + /* XXX ondisk32 */ + for (++bn; bn < NDADDR && *runp < maxrun && + is_sequential(ump, (daddr_t)fs2h32(ip->i_e2fs_blocks[bn - 1]), + (daddr_t)fs2h32(ip->i_e2fs_blocks[bn])); + ++bn, ++*runp); + return (0); + } + + xap = ap == NULL ? a : ap; + if (!nump) + nump = # + if ((error = ufs_getlbns(vp, bn, xap, nump)) != 0) + return (error); + + num = *nump; + + /* Get disk address out of indirect block array */ + /* XXX ondisk32 */ + daddr = fs2h32(ip->i_e2fs_blocks[NDADDR + xap->in_off]); + +#ifdef DIAGNOSTIC + if (num > NIADDR + 1 || num < 1) { + printf("ext2fs_bmaparray: num=%d\n", num); + panic("ext2fs_bmaparray: num"); + } +#endif + for (bp = NULL, ++xap; --num; ++xap) { + /* + * Exit the loop if there is no disk address assigned yet and + * the indirect block isn't in the cache, or if we were + * looking for an indirect block and we've found it. + */ + + metalbn = xap->in_lbn; + if (metalbn == bn) + break; + if (daddr == 0) { + mutex_enter(&bufcache_lock); + cbp = incore(vp, metalbn); + mutex_exit(&bufcache_lock); + if (cbp == NULL) + break; + } + /* + * If we get here, we've either got the block in the cache + * or we have a disk address for it, go fetch it. + */ + if (bp) + brelse(bp, 0); + + xap->in_exists = 1; + bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0); + if (bp == NULL) { + + /* + * getblk() above returns NULL only iff we are + * pagedaemon. See the implementation of getblk + * for detail. + */ + + return (ENOMEM); + } + if (bp->b_oflags & (BO_DONE | BO_DELWRI)) { + trace(TR_BREADHIT, pack(vp, size), metalbn); + } +#ifdef DIAGNOSTIC + else if (!daddr) + panic("ext2fs_bmaparry: indirect block not in cache"); +#endif + else { + trace(TR_BREADMISS, pack(vp, size), metalbn); + bp->b_blkno = blkptrtodb(ump, daddr); + bp->b_flags |= B_READ; + VOP_STRATEGY(vp, bp); + curlwp->l_ru.ru_inblock++; /* XXX */ + if ((error = biowait(bp)) != 0) { + brelse(bp, 0); + return (error); + } + } + + /* XXX ondisk32 */ + daddr = fs2h32(((int32_t *)bp->b_data)[xap->in_off]); + if (num == 1 && daddr && runp) + /* XXX ondisk32 */ + for (bn = xap->in_off + 1; + bn < MNINDIR(ump) && *runp < maxrun && + is_sequential(ump, ((int32_t *)bp->b_data)[bn - 1], + ((int32_t *)bp->b_data)[bn]); + ++bn, ++*runp); + } + if (bp) + brelse(bp, 0); + + daddr = blkptrtodb(ump, daddr); + *bnp = daddr == 0 ? -1 : daddr; + return (0); +} diff --git a/sys/ufs/ext2fs/ext2fs_bswap.c b/sys/ufs/ext2fs/ext2fs_bswap.c new file mode 100644 index 000000000..ba0ddc462 --- /dev/null +++ b/sys/ufs/ext2fs/ext2fs_bswap.c @@ -0,0 +1,121 @@ +/* $NetBSD: ext2fs_bswap.c,v 1.16 2009/10/19 18:41:17 bouyer Exp $ */ + +/* + * Copyright (c) 1997 Manuel Bouyer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ext2fs_bswap.c,v 1.16 2009/10/19 18:41:17 bouyer Exp $"); + +#include +#include +#include + +#if defined(_KERNEL) +#include +#else +#include +#endif + +/* These functions are only needed if native byte order is not big endian */ +#if BYTE_ORDER == BIG_ENDIAN +void +e2fs_sb_bswap(struct ext2fs *old, struct ext2fs *new) +{ + + /* preserve unused fields */ + memcpy(new, old, sizeof(struct ext2fs)); + new->e2fs_icount = bswap32(old->e2fs_icount); + new->e2fs_bcount = bswap32(old->e2fs_bcount); + new->e2fs_rbcount = bswap32(old->e2fs_rbcount); + new->e2fs_fbcount = bswap32(old->e2fs_fbcount); + new->e2fs_ficount = bswap32(old->e2fs_ficount); + new->e2fs_first_dblock = bswap32(old->e2fs_first_dblock); + new->e2fs_log_bsize = bswap32(old->e2fs_log_bsize); + new->e2fs_fsize = bswap32(old->e2fs_fsize); + new->e2fs_bpg = bswap32(old->e2fs_bpg); + new->e2fs_fpg = bswap32(old->e2fs_fpg); + new->e2fs_ipg = bswap32(old->e2fs_ipg); + new->e2fs_mtime = bswap32(old->e2fs_mtime); + new->e2fs_wtime = bswap32(old->e2fs_wtime); + new->e2fs_mnt_count = bswap16(old->e2fs_mnt_count); + new->e2fs_max_mnt_count = bswap16(old->e2fs_max_mnt_count); + new->e2fs_magic = bswap16(old->e2fs_magic); + new->e2fs_state = bswap16(old->e2fs_state); + new->e2fs_beh = bswap16(old->e2fs_beh); + new->e2fs_minrev = bswap16(old->e2fs_minrev); + new->e2fs_lastfsck = bswap32(old->e2fs_lastfsck); + new->e2fs_fsckintv = bswap32(old->e2fs_fsckintv); + new->e2fs_creator = bswap32(old->e2fs_creator); + new->e2fs_rev = bswap32(old->e2fs_rev); + new->e2fs_ruid = bswap16(old->e2fs_ruid); + new->e2fs_rgid = bswap16(old->e2fs_rgid); + new->e2fs_first_ino = bswap32(old->e2fs_first_ino); + new->e2fs_inode_size = bswap16(old->e2fs_inode_size); + new->e2fs_block_group_nr = bswap16(old->e2fs_block_group_nr); + new->e2fs_features_compat = bswap32(old->e2fs_features_compat); + new->e2fs_features_incompat = bswap32(old->e2fs_features_incompat); + new->e2fs_features_rocompat = bswap32(old->e2fs_features_rocompat); + new->e2fs_algo = bswap32(old->e2fs_algo); + new->e2fs_reserved_ngdb = bswap16(old->e2fs_reserved_ngdb); +} + +void e2fs_cg_bswap(struct ext2_gd *old, struct ext2_gd *new, int size) +{ + int i; + + for (i = 0; i < (size / (int)sizeof(struct ext2_gd)); i++) { + new[i].ext2bgd_b_bitmap = bswap32(old[i].ext2bgd_b_bitmap); + new[i].ext2bgd_i_bitmap = bswap32(old[i].ext2bgd_i_bitmap); + new[i].ext2bgd_i_tables = bswap32(old[i].ext2bgd_i_tables); + new[i].ext2bgd_nbfree = bswap16(old[i].ext2bgd_nbfree); + new[i].ext2bgd_nifree = bswap16(old[i].ext2bgd_nifree); + new[i].ext2bgd_ndirs = bswap16(old[i].ext2bgd_ndirs); + } +} + +void e2fs_i_bswap(struct ext2fs_dinode *old, struct ext2fs_dinode *new) +{ + + new->e2di_mode = bswap16(old->e2di_mode); + new->e2di_uid = 
bswap16(old->e2di_uid); + new->e2di_gid = bswap16(old->e2di_gid); + new->e2di_nlink = bswap16(old->e2di_nlink); + new->e2di_size = bswap32(old->e2di_size); + new->e2di_atime = bswap32(old->e2di_atime); + new->e2di_ctime = bswap32(old->e2di_ctime); + new->e2di_mtime = bswap32(old->e2di_mtime); + new->e2di_dtime = bswap32(old->e2di_dtime); + new->e2di_nblock = bswap32(old->e2di_nblock); + new->e2di_flags = bswap32(old->e2di_flags); + new->e2di_gen = bswap32(old->e2di_gen); + new->e2di_facl = bswap32(old->e2di_facl); + new->e2di_dacl = bswap32(old->e2di_dacl); + new->e2di_faddr = bswap32(old->e2di_faddr); + new->e2di_uid_high = bswap16(old->e2di_uid_high); + new->e2di_gid_high = bswap16(old->e2di_gid_high); + memcpy(&new->e2di_blocks[0], &old->e2di_blocks[0], + (NDADDR + NIADDR) * sizeof(uint32_t)); +} +#endif diff --git a/include/ufs/ext2fs/ext2fs_dinode.h b/sys/ufs/ext2fs/ext2fs_dinode.h similarity index 100% rename from include/ufs/ext2fs/ext2fs_dinode.h rename to sys/ufs/ext2fs/ext2fs_dinode.h diff --git a/include/ufs/ext2fs/ext2fs_dir.h b/sys/ufs/ext2fs/ext2fs_dir.h similarity index 100% rename from include/ufs/ext2fs/ext2fs_dir.h rename to sys/ufs/ext2fs/ext2fs_dir.h diff --git a/include/ufs/ext2fs/ext2fs_extern.h b/sys/ufs/ext2fs/ext2fs_extern.h similarity index 100% rename from include/ufs/ext2fs/ext2fs_extern.h rename to sys/ufs/ext2fs/ext2fs_extern.h diff --git a/sys/ufs/ext2fs/ext2fs_inode.c b/sys/ufs/ext2fs/ext2fs_inode.c new file mode 100644 index 000000000..0d52fb494 --- /dev/null +++ b/sys/ufs/ext2fs/ext2fs_inode.c @@ -0,0 +1,558 @@ +/* $NetBSD: ext2fs_inode.c,v 1.74 2011/06/16 09:21:03 hannken Exp $ */ + +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_inode.c 8.8 (Berkeley) 10/19/94 + * Modified for ext2fs by Manuel Bouyer. + */ + +/* + * Copyright (c) 1997 Manuel Bouyer. 
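The e2fs_sb_bswap()/e2fs_cg_bswap()/e2fs_i_bswap() helpers just above exist because ext2 keeps every multi-byte field little-endian on disk, so a big-endian host must swap each field on the way in and out. A minimal stand-alone illustration with a hypothetical two-field record; the kernel uses bswap16()/bswap32() from libkern rather than these local swappers:

#include <stdio.h>
#include <stdint.h>

static uint16_t swap16(uint16_t x) { return (uint16_t)((x >> 8) | (x << 8)); }
static uint32_t swap32(uint32_t x)
{
    return ((x >> 24) & 0xff) | ((x >> 8) & 0xff00) |
           ((x << 8) & 0xff0000) | ((uint32_t)(x & 0xff) << 24);
}

/* A hypothetical on-disk record, by analogy with struct ext2fs_dinode. */
struct demo_dinode { uint16_t mode; uint32_t size; };

/* Field-by-field swap, exactly the shape of e2fs_i_bswap() above. */
static void demo_i_bswap(const struct demo_dinode *old, struct demo_dinode *new)
{
    new->mode = swap16(old->mode);
    new->size = swap32(old->size);
}

int main(void)
{
    struct demo_dinode on_disk = { 0xa481, 0x00001000 }, host;

    demo_i_bswap(&on_disk, &host);
    printf("mode 0x%04x -> 0x%04x, size 0x%08x -> 0x%08x\n",
        (unsigned)on_disk.mode, (unsigned)host.mode,
        (unsigned)on_disk.size, (unsigned)host.size);
    return 0;
}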
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @(#)ffs_inode.c 8.8 (Berkeley) 10/19/94 + * Modified for ext2fs by Manuel Bouyer. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ext2fs_inode.c,v 1.74 2011/06/16 09:21:03 hannken Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +extern int prtactive; + +static int ext2fs_indirtrunc(struct inode *, daddr_t, daddr_t, + daddr_t, int, long *); + +/* + * Get the size of an inode. + */ +uint64_t +ext2fs_size(struct inode *ip) +{ + uint64_t size = ip->i_e2fs_size; + + if ((ip->i_e2fs_mode & IFMT) == IFREG) + size |= (uint64_t)ip->i_e2fs_dacl << 32; + return size; +} + +int +ext2fs_setsize(struct inode *ip, uint64_t size) +{ + if ((ip->i_e2fs_mode & IFMT) == IFREG || + ip->i_e2fs_mode == 0) { + ip->i_e2fs_dacl = size >> 32; + if (size >= 0x80000000U) { + struct m_ext2fs *fs = ip->i_e2fs; + + if (fs->e2fs.e2fs_rev <= E2FS_REV0) { + /* Linux automagically upgrades to REV1 here! */ + return EFBIG; + } + if (!(fs->e2fs.e2fs_features_rocompat + & EXT2F_ROCOMPAT_LARGEFILE)) { + fs->e2fs.e2fs_features_rocompat |= + EXT2F_ROCOMPAT_LARGEFILE; + fs->e2fs_fmod = 1; + } + } + } else if (size >= 0x80000000U) + return EFBIG; + + ip->i_e2fs_size = size; + + return 0; +} + +/* + * Last reference to an inode. If necessary, write or delete it. + */ +int +ext2fs_inactive(void *v) +{ + struct vop_inactive_args /* { + struct vnode *a_vp; + bool *a_recycle; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + int error = 0; + + if (prtactive && vp->v_usecount != 0) + vprint("ext2fs_inactive: pushing active", vp); + /* Get rid of inodes related to stale file handles. 
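ext2fs_size()/ext2fs_setsize() below encode the revision-1 trick of storing a regular file's size in two halves: the low 32 bits in the inode's size field and the high 32 bits in the otherwise unused directory-ACL field. A small model of that split; the struct and field names here are hypothetical:

#include <stdio.h>
#include <stdint.h>

struct demo_inode {
    uint32_t size_lo;    /* plays the role of i_e2fs_size */
    uint32_t size_hi;    /* plays the role of i_e2fs_dacl, reused for IFREG */
};

static uint64_t demo_getsize(const struct demo_inode *ip)
{
    return (uint64_t)ip->size_hi << 32 | ip->size_lo;
}

static void demo_setsize(struct demo_inode *ip, uint64_t size)
{
    ip->size_lo = (uint32_t)size;
    ip->size_hi = (uint32_t)(size >> 32);  /* sizes >= 2 GB also need the
                                              ROCOMPAT_LARGEFILE feature bit */
}

int main(void)
{
    struct demo_inode ip;

    demo_setsize(&ip, 5ULL * 1024 * 1024 * 1024);          /* a 5 GB file */
    printf("lo=0x%08x hi=0x%08x -> size %llu\n",
        (unsigned)ip.size_lo, (unsigned)ip.size_hi,
        (unsigned long long)demo_getsize(&ip));
    return 0;
}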
*/ + if (ip->i_e2fs_mode == 0 || ip->i_e2fs_dtime != 0) + goto out; + + error = 0; + if (ip->i_e2fs_nlink == 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { + /* Defer final inode free and update to reclaim.*/ + if (ext2fs_size(ip) != 0) { + error = ext2fs_truncate(vp, (off_t)0, 0, NOCRED); + } + ip->i_e2fs_dtime = time_second; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + ip->i_omode = 1; + } + if (ip->i_flag & (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) { + ext2fs_update(vp, NULL, NULL, 0); + } +out: + /* + * If we are done with the inode, reclaim it + * so that it can be reused immediately. + */ + *ap->a_recycle = (ip->i_e2fs_dtime != 0); + VOP_UNLOCK(vp); + return (error); +} + + +/* + * Update the access, modified, and inode change times as specified by the + * IACCESS, IUPDATE, and ICHANGE flags respectively. The IMODIFIED flag is + * used to specify that the inode needs to be updated but that the times have + * already been set. The access and modified times are taken from the second + * and third parameters; the inode change time is always taken from the current + * time. If UPDATE_WAIT or UPDATE_DIROP is set, then wait for the disk + * write of the inode to complete. + */ +int +ext2fs_update(struct vnode *vp, const struct timespec *acc, + const struct timespec *mod, int updflags) +{ + struct m_ext2fs *fs; + struct buf *bp; + struct inode *ip; + int error; + void *cp; + int flags; + + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (0); + ip = VTOI(vp); + EXT2FS_ITIMES(ip, acc, mod, NULL); + if (updflags & UPDATE_CLOSE) + flags = ip->i_flag & (IN_MODIFIED | IN_ACCESSED); + else + flags = ip->i_flag & IN_MODIFIED; + if (flags == 0) + return (0); + fs = ip->i_e2fs; + + error = bread(ip->i_devvp, + fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), + (int)fs->e2fs_bsize, NOCRED, B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + return (error); + } + ip->i_flag &= ~(IN_MODIFIED | IN_ACCESSED); + cp = (char *)bp->b_data + + (ino_to_fsbo(fs, ip->i_number) * EXT2_DINODE_SIZE(fs)); + e2fs_isave(ip->i_din.e2fs_din, (struct ext2fs_dinode *)cp); + if ((updflags & (UPDATE_WAIT|UPDATE_DIROP)) != 0 && + (flags & IN_MODIFIED) != 0 && + (vp->v_mount->mnt_flag & MNT_ASYNC) == 0) + return (bwrite(bp)); + else { + bdwrite(bp); + return (0); + } +} + +#define SINGLE 0 /* index of single indirect block */ +#define DOUBLE 1 /* index of double indirect block */ +#define TRIPLE 2 /* index of triple indirect block */ +/* + * Truncate the inode oip to at most length size, freeing the + * disk blocks. 
+ */ +int +ext2fs_truncate(struct vnode *ovp, off_t length, int ioflag, + kauth_cred_t cred) +{ + daddr_t lastblock; + struct inode *oip = VTOI(ovp); + daddr_t bn, lastiblock[NIADDR], indir_lbn[NIADDR]; + /* XXX ondisk32 */ + int32_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR]; + struct m_ext2fs *fs; + int offset, size, level; + long count, blocksreleased = 0; + int i, nblocks; + int error, allerror = 0; + off_t osize; + int sync; + struct ufsmount *ump = oip->i_ump; + + if (ovp->v_type == VCHR || ovp->v_type == VBLK || + ovp->v_type == VFIFO || ovp->v_type == VSOCK) { + return 0; + } + + if (length < 0) + return (EINVAL); + + if (ovp->v_type == VLNK && + (ext2fs_size(oip) < ump->um_maxsymlinklen || + (ump->um_maxsymlinklen == 0 && oip->i_e2fs_nblock == 0))) { + KDASSERT(length == 0); + memset((char *)&oip->i_din.e2fs_din->e2di_shortlink, 0, + (u_int)ext2fs_size(oip)); + (void)ext2fs_setsize(oip, 0); + oip->i_flag |= IN_CHANGE | IN_UPDATE; + return (ext2fs_update(ovp, NULL, NULL, 0)); + } + if (ext2fs_size(oip) == length) { + /* still do a uvm_vnp_setsize() as writesize may be larger */ + uvm_vnp_setsize(ovp, length); + oip->i_flag |= IN_CHANGE | IN_UPDATE; + return (ext2fs_update(ovp, NULL, NULL, 0)); + } + fs = oip->i_e2fs; + if (length > ump->um_maxfilesize) + return (EFBIG); + + osize = ext2fs_size(oip); + + /* + * Lengthen the size of the file. We must ensure that the + * last byte of the file is allocated. Since the smallest + * value of osize is 0, length will be at least 1. + */ + if (osize < length) { + uvm_vnp_setwritesize(ovp, length); + error = ufs_balloc_range(ovp, length - 1, 1, cred, + ioflag & IO_SYNC ? B_SYNC : 0); + if (error) { + (void) ext2fs_truncate(ovp, osize, ioflag & IO_SYNC, + cred); + return (error); + } + uvm_vnp_setsize(ovp, length); + oip->i_flag |= IN_CHANGE | IN_UPDATE; + KASSERT(error || ovp->v_size == ext2fs_size(oip)); + return (ext2fs_update(ovp, NULL, NULL, 0)); + } + /* + * Shorten the size of the file. If the file is not being + * truncated to a block boundry, the contents of the + * partial block following the end of the file must be + * zero'ed in case it ever become accessible again because + * of subsequent file growth. + */ + offset = blkoff(fs, length); + if (offset != 0) { + size = fs->e2fs_bsize; + + /* XXXUBC we should handle more than just VREG */ + ubc_zerorange(&ovp->v_uobj, length, size - offset, + UBC_UNMAP_FLAG(ovp)); + } + (void)ext2fs_setsize(oip, length); + uvm_vnp_setsize(ovp, length); + /* + * Calculate index into inode's block list of + * last direct and indirect blocks (if any) + * which we want to keep. Lastblock is -1 when + * the file is truncated to 0. + */ + lastblock = lblkno(fs, length + fs->e2fs_bsize - 1) - 1; + lastiblock[SINGLE] = lastblock - NDADDR; + lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs); + lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs); + nblocks = btodb(fs->e2fs_bsize); + /* + * Update file and block pointers on disk before we start freeing + * blocks. If we crash before free'ing blocks below, the blocks + * will be returned to the free list. lastiblock values are also + * normalized to -1 for calls to ext2fs_indirtrunc below. 
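The index arithmetic just above decides what survives the truncate: lastblock is the last logical block still needed for the new length, and lastiblock[] says how far the file still reaches into each level of indirection (a negative value means that level disappears entirely). Worked through with made-up geometry (4 KB blocks, 12 direct pointers, 1024 pointers per indirect block):

#include <stdio.h>

#define BSIZE  4096
#define NDADDR 12
#define NINDIR 1024

int main(void)
{
    long long length = 10LL * 1024 * 1024;                   /* truncate to 10 MB */
    long long lastblock = (length + BSIZE - 1) / BSIZE - 1;  /* lblkno(...) - 1 */
    long long lastiblock[3];

    lastiblock[0] = lastblock - NDADDR;                          /* SINGLE */
    lastiblock[1] = lastiblock[0] - NINDIR;                      /* DOUBLE */
    lastiblock[2] = lastiblock[1] - (long long)NINDIR * NINDIR;  /* TRIPLE */

    printf("lastblock = %lld\n", lastblock);
    for (int i = 0; i < 3; i++)
        printf("lastiblock[%d] = %lld%s\n", i, lastiblock[i],
            lastiblock[i] < 0 ? "  (level freed entirely)" : "");
    return 0;
}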
+ */ + memcpy((void *)oldblks, (void *)&oip->i_e2fs_blocks[0], sizeof oldblks); + sync = 0; + for (level = TRIPLE; level >= SINGLE; level--) { + if (lastiblock[level] < 0 && oldblks[NDADDR + level] != 0) { + sync = 1; + oip->i_e2fs_blocks[NDADDR + level] = 0; + lastiblock[level] = -1; + } + } + for (i = 0; i < NDADDR; i++) { + if (i > lastblock && oldblks[i] != 0) { + sync = 1; + oip->i_e2fs_blocks[i] = 0; + } + } + oip->i_flag |= IN_CHANGE | IN_UPDATE; + if (sync) { + error = ext2fs_update(ovp, NULL, NULL, UPDATE_WAIT); + if (error && !allerror) + allerror = error; + } + + /* + * Having written the new inode to disk, save its new configuration + * and put back the old block pointers long enough to process them. + * Note that we save the new block configuration so we can check it + * when we are done. + */ + memcpy((void *)newblks, (void *)&oip->i_e2fs_blocks[0], sizeof newblks); + memcpy((void *)&oip->i_e2fs_blocks[0], (void *)oldblks, sizeof oldblks); + + (void)ext2fs_setsize(oip, osize); + error = vtruncbuf(ovp, lastblock + 1, 0, 0); + if (error && !allerror) + allerror = error; + + /* + * Indirect blocks first. + */ + indir_lbn[SINGLE] = -NDADDR; + indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) -1; + indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1; + for (level = TRIPLE; level >= SINGLE; level--) { + /* XXX ondisk32 */ + bn = fs2h32(oip->i_e2fs_blocks[NDADDR + level]); + if (bn != 0) { + error = ext2fs_indirtrunc(oip, indir_lbn[level], + fsbtodb(fs, bn), lastiblock[level], level, &count); + if (error) + allerror = error; + blocksreleased += count; + if (lastiblock[level] < 0) { + oip->i_e2fs_blocks[NDADDR + level] = 0; + ext2fs_blkfree(oip, bn); + blocksreleased += nblocks; + } + } + if (lastiblock[level] >= 0) + goto done; + } + + /* + * All whole direct blocks or frags. + */ + for (i = NDADDR - 1; i > lastblock; i--) { + /* XXX ondisk32 */ + bn = fs2h32(oip->i_e2fs_blocks[i]); + if (bn == 0) + continue; + oip->i_e2fs_blocks[i] = 0; + ext2fs_blkfree(oip, bn); + blocksreleased += btodb(fs->e2fs_bsize); + } + +done: +#ifdef DIAGNOSTIC + for (level = SINGLE; level <= TRIPLE; level++) + if (newblks[NDADDR + level] != + oip->i_e2fs_blocks[NDADDR + level]) + panic("ext2fs_truncate1"); + for (i = 0; i < NDADDR; i++) + if (newblks[i] != oip->i_e2fs_blocks[i]) + panic("ext2fs_truncate2"); + if (length == 0 && + (!LIST_EMPTY(&ovp->v_cleanblkhd) || + !LIST_EMPTY(&ovp->v_dirtyblkhd))) + panic("ext2fs_truncate3"); +#endif /* DIAGNOSTIC */ + /* + * Put back the real size. + */ + (void)ext2fs_setsize(oip, length); + oip->i_e2fs_nblock -= blocksreleased; + oip->i_flag |= IN_CHANGE; + KASSERT(ovp->v_type != VREG || ovp->v_size == ext2fs_size(oip)); + return (allerror); +} + +/* + * Release blocks associated with the inode ip and stored in the indirect + * block bn. Blocks are free'd in LIFO order up to (but not including) + * lastbn. If level is greater than SINGLE, the block is an indirect block + * and recursive calls to indirtrunc must be used to cleanse other indirect + * blocks. + * + * NB: triple indirect blocks are untested. 
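Inside ext2fs_indirtrunc(), factor is how many data blocks one pointer spans at the current level, lastbn / factor is the last pointer in this indirect block to keep, and lastbn % factor becomes the child's own lastbn in the recursive call on that last, partially kept pointer. A numeric sketch for a double-indirect block, continuing the made-up geometry used above:

#include <stdio.h>

#define NINDIR 1024   /* pointers per indirect block (made-up, as above) */

int main(void)
{
    long long lastbn = 1523;                  /* e.g. lastiblock[DOUBLE]      */
    long long factor = NINDIR;                /* NINDIR^1 at the DOUBLE level */
    long long last = lastbn / factor;         /* last pointer kept here       */
    long long child_lastbn = lastbn % factor; /* passed down to level - 1     */

    printf("keep pointers 0..%lld, free pointers %lld..%d\n",
        last, last + 1, NINDIR - 1);
    printf("recurse on pointer %lld with lastbn %lld\n", last, child_lastbn);
    return 0;
}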
+ */ +static int +ext2fs_indirtrunc(struct inode *ip, daddr_t lbn, daddr_t dbn, daddr_t lastbn, + int level, long *countp) +{ + int i; + struct buf *bp; + struct m_ext2fs *fs = ip->i_e2fs; + int32_t *bap; /* XXX ondisk32 */ + struct vnode *vp; + daddr_t nb, nlbn, last; + int32_t *copy = NULL; /* XXX ondisk32 */ + long blkcount, factor; + int nblocks, blocksreleased = 0; + int error = 0, allerror = 0; + + /* + * Calculate index in current block of last + * block to be kept. -1 indicates the entire + * block so we need not calculate the index. + */ + factor = 1; + for (i = SINGLE; i < level; i++) + factor *= NINDIR(fs); + last = lastbn; + if (lastbn > 0) + last /= factor; + nblocks = btodb(fs->e2fs_bsize); + /* + * Get buffer of block pointers, zero those entries corresponding + * to blocks to be free'd, and update on disk copy first. Since + * double(triple) indirect before single(double) indirect, calls + * to bmap on these blocks will fail. However, we already have + * the on disk address, so we have to set the b_blkno field + * explicitly instead of letting bread do everything for us. + */ + vp = ITOV(ip); + bp = getblk(vp, lbn, (int)fs->e2fs_bsize, 0, 0); + if (bp->b_oflags & (BO_DONE | BO_DELWRI)) { + /* Braces must be here in case trace evaluates to nothing. */ + trace(TR_BREADHIT, pack(vp, fs->e2fs_bsize), lbn); + } else { + trace(TR_BREADMISS, pack(vp, fs->e2fs_bsize), lbn); + curlwp->l_ru.ru_inblock++; /* pay for read */ + bp->b_flags |= B_READ; + if (bp->b_bcount > bp->b_bufsize) + panic("ext2fs_indirtrunc: bad buffer size"); + bp->b_blkno = dbn; + VOP_STRATEGY(vp, bp); + error = biowait(bp); + } + if (error) { + brelse(bp, 0); + *countp = 0; + return (error); + } + + bap = (int32_t *)bp->b_data; /* XXX ondisk32 */ + if (lastbn >= 0) { + /* XXX ondisk32 */ + copy = malloc(fs->e2fs_bsize, M_TEMP, M_WAITOK); + memcpy((void *)copy, (void *)bap, (u_int)fs->e2fs_bsize); + memset((void *)&bap[last + 1], 0, + (u_int)(NINDIR(fs) - (last + 1)) * sizeof (uint32_t)); + error = bwrite(bp); + if (error) + allerror = error; + bap = copy; + } + + /* + * Recursively free totally unused blocks. + */ + for (i = NINDIR(fs) - 1, + nlbn = lbn + 1 - i * factor; i > last; + i--, nlbn += factor) { + /* XXX ondisk32 */ + nb = fs2h32(bap[i]); + if (nb == 0) + continue; + if (level > SINGLE) { + error = ext2fs_indirtrunc(ip, nlbn, fsbtodb(fs, nb), + (daddr_t)-1, level - 1, + &blkcount); + if (error) + allerror = error; + blocksreleased += blkcount; + } + ext2fs_blkfree(ip, nb); + blocksreleased += nblocks; + } + + /* + * Recursively free last partial block. 
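Each slot of an indirect block at a given level covers NINDIR^level data blocks, which is what the factor/last computation at the top of ext2fs_indirtrunc expresses. A small stand-alone sketch of that arithmetic follows, again assuming 1024 pointers per indirect block; it reproduces only the index math, not the buffer handling.

#include <stdio.h>

#define BSIZE  4096
#define NINDIR (BSIZE / 4)      /* assumed pointers per indirect block */

enum { SINGLE = 0, DOUBLE = 1, TRIPLE = 2 };

int
main(void)
{
    long long lastbn = 5000;    /* last data block to keep, relative to this level */
    long long factor = 1, last, freed;
    int i, level = DOUBLE;      /* example: a double-indirect block */

    /* factor = NINDIR^level: data blocks covered by one slot at this level. */
    for (i = SINGLE; i < level; i++)
        factor *= NINDIR;

    last = lastbn / factor;             /* last slot kept in this block */
    freed = NINDIR - 1 - last;          /* slots freed entirely, recursively */
    printf("factor %lld, last slot kept %lld, slots freed %lld\n",
        factor, last, freed);
    return 0;
}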
+ */ + if (level > SINGLE && lastbn >= 0) { + last = lastbn % factor; + /* XXX ondisk32 */ + nb = fs2h32(bap[i]); + if (nb != 0) { + error = ext2fs_indirtrunc(ip, nlbn, fsbtodb(fs, nb), + last, level - 1, &blkcount); + if (error) + allerror = error; + blocksreleased += blkcount; + } + } + + if (copy != NULL) { + free(copy, M_TEMP); + } else { + brelse(bp, BC_INVAL); + } + + *countp = blocksreleased; + return (allerror); +} diff --git a/sys/ufs/ext2fs/ext2fs_lookup.c b/sys/ufs/ext2fs/ext2fs_lookup.c new file mode 100644 index 000000000..eb59c45a0 --- /dev/null +++ b/sys/ufs/ext2fs/ext2fs_lookup.c @@ -0,0 +1,1079 @@ +/* $NetBSD: ext2fs_lookup.c,v 1.66 2011/07/12 16:59:48 dholland Exp $ */ + +/* + * Modified for NetBSD 1.2E + * May 1997, Manuel Bouyer + * Laboratoire d'informatique de Paris VI + */ +/* + * modified for Lites 1.1 + * + * Aug 1995, Godmar Back (gback@cs.utah.edu) + * University of Utah, Department of Computer Science + */ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ufs_lookup.c 8.6 (Berkeley) 4/1/94 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ext2fs_lookup.c,v 1.66 2011/07/12 16:59:48 dholland Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +extern int dirchk; + +static void ext2fs_dirconv2ffs(struct ext2fs_direct *e2dir, + struct dirent *ffsdir); +static int ext2fs_dirbadentry(struct vnode *dp, + struct ext2fs_direct *de, + int entryoffsetinblock); + +/* + * the problem that is tackled below is the fact that FFS + * includes the terminating zero on disk while EXT2FS doesn't + * this implies that we need to introduce some padding. + * For instance, a filename "sbin" has normally a reclen 12 + * in EXT2, but 16 in FFS. + * This reminds me of that Pepsi commercial: 'Kid saved a lousy nine cents...' + * If it wasn't for that, the complete ufs code for directories would + * have worked w/o changes (except for the difference in DIRBLKSIZ) + */ +static void +ext2fs_dirconv2ffs(struct ext2fs_direct *e2dir, struct dirent *ffsdir) +{ + memset(ffsdir, 0, sizeof(struct dirent)); + ffsdir->d_fileno = fs2h32(e2dir->e2d_ino); + ffsdir->d_namlen = e2dir->e2d_namlen; + + ffsdir->d_type = DT_UNKNOWN; /* don't know more here */ +#ifdef DIAGNOSTIC +#if MAXNAMLEN < E2FS_MAXNAMLEN + /* + * we should handle this more gracefully ! + */ + if (e2dir->e2d_namlen > MAXNAMLEN) + panic("ext2fs: e2dir->e2d_namlen"); +#endif +#endif + strncpy(ffsdir->d_name, e2dir->e2d_name, ffsdir->d_namlen); + + /* Godmar thinks: since e2dir->e2d_reclen can be big and means + nothing anyway, we compute our own reclen according to what + we think is right + */ + ffsdir->d_reclen = _DIRENT_SIZE(ffsdir); +} + +/* + * Vnode op for reading directories. + * + * Convert the on-disk entries to entries. + * the problem is that the conversion will blow up some entries by four bytes, + * so it can't be done in place. This is too bad. Right now the conversion is + * done entry by entry, the converted entry is sent via uiomove. + * + * XXX allocate a buffer, convert as many entries as possible, then send + * the whole buffer to uiomove + */ +int +ext2fs_readdir(void *v) +{ + struct vop_readdir_args /* { + struct vnode *a_vp; + struct uio *a_uio; + kauth_cred_t a_cred; + int **a_eofflag; + off_t **a_cookies; + int ncookies; + } */ *ap = v; + struct uio *uio = ap->a_uio; + int error; + size_t e2fs_count, readcnt; + struct vnode *vp = ap->a_vp; + struct m_ext2fs *fs = VTOI(vp)->i_e2fs; + + struct ext2fs_direct *dp; + struct dirent *dstd; + struct uio auio; + struct iovec aiov; + void *dirbuf; + off_t off = uio->uio_offset; + off_t *cookies = NULL; + int nc = 0, ncookies = 0; + int e2d_reclen; + + if (vp->v_type != VDIR) + return (ENOTDIR); + + e2fs_count = uio->uio_resid; + /* Make sure we don't return partial entries. 
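The reclen difference described in the comment above is plain arithmetic: the ext2 record is an 8-byte header plus the name rounded up to 4 bytes, while the FFS-style dirent also stores the terminating NUL before rounding. The sketch below uses illustrative helpers, not the kernel's EXT2FS_DIRSIZ/_DIRENT_SIZE macros, and reproduces the 12-versus-16 example for "sbin".

#include <stdio.h>
#include <string.h>

/* Round x up to a multiple of 4, as both formats do for records. */
#define ROUNDUP4(x)     (((x) + 3) & ~3u)

/* ext2 on-disk entry: inode(4) + reclen(2) + namlen(1) + type(1) + name. */
static unsigned
ext2_dirsiz(unsigned namlen)
{
    return 8 + ROUNDUP4(namlen);
}

/* FFS-style dirent: same 8-byte header, but the name keeps its NUL byte. */
static unsigned
ffs_dirsiz(unsigned namlen)
{
    return 8 + ROUNDUP4(namlen + 1);
}

int
main(void)
{
    const char *name = "sbin";
    unsigned n = (unsigned)strlen(name);

    printf("%s: ext2 reclen %u, ffs reclen %u\n",
        name, ext2_dirsiz(n), ffs_dirsiz(n));   /* prints 12 and 16 */
    return 0;
}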
*/ + e2fs_count -= (uio->uio_offset + e2fs_count) & (fs->e2fs_bsize -1); + if (e2fs_count <= 0) + return (EINVAL); + + auio = *uio; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_len = e2fs_count; + auio.uio_resid = e2fs_count; + UIO_SETUP_SYSSPACE(&auio); + dirbuf = malloc(e2fs_count, M_TEMP, M_WAITOK); + dstd = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK | M_ZERO); + if (ap->a_ncookies) { + nc = e2fs_count / _DIRENT_MINSIZE((struct dirent *)0); + ncookies = nc; + cookies = malloc(sizeof (off_t) * ncookies, M_TEMP, M_WAITOK); + *ap->a_cookies = cookies; + } + memset(dirbuf, 0, e2fs_count); + aiov.iov_base = dirbuf; + + error = VOP_READ(ap->a_vp, &auio, 0, ap->a_cred); + if (error == 0) { + readcnt = e2fs_count - auio.uio_resid; + for (dp = (struct ext2fs_direct *)dirbuf; + (char *)dp < (char *)dirbuf + readcnt; ) { + e2d_reclen = fs2h16(dp->e2d_reclen); + if (e2d_reclen == 0) { + error = EIO; + break; + } + ext2fs_dirconv2ffs(dp, dstd); + if(dstd->d_reclen > uio->uio_resid) { + break; + } + error = uiomove(dstd, dstd->d_reclen, uio); + if (error != 0) { + break; + } + off = off + e2d_reclen; + if (cookies != NULL) { + *cookies++ = off; + if (--ncookies <= 0){ + break; /* out of cookies */ + } + } + /* advance dp */ + dp = (struct ext2fs_direct *) ((char *)dp + e2d_reclen); + } + /* we need to correct uio_offset */ + uio->uio_offset = off; + } + free(dirbuf, M_TEMP); + free(dstd, M_TEMP); + *ap->a_eofflag = ext2fs_size(VTOI(ap->a_vp)) <= uio->uio_offset; + if (ap->a_ncookies) { + if (error) { + free(*ap->a_cookies, M_TEMP); + *ap->a_ncookies = 0; + *ap->a_cookies = NULL; + } else + *ap->a_ncookies = nc - ncookies; + } + return (error); +} + +/* + * Convert a component of a pathname into a pointer to a locked inode. + * This is a very central and rather complicated routine. + * If the file system is not maintained in a strict tree hierarchy, + * this can result in a deadlock situation (see comments in code below). + * + * The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending + * on whether the name is to be looked up, created, renamed, or deleted. + * When CREATE, RENAME, or DELETE is specified, information usable in + * creating, renaming, or deleting a directory entry may be calculated. + * If flag has LOCKPARENT or'ed into it and the target of the pathname + * exists, lookup returns both the target and its parent directory locked. + * When creating or renaming and LOCKPARENT is specified, the target may + * not be ".". When deleting and LOCKPARENT is specified, the target may + * be "."., but the caller must check to ensure it does an vrele and vput + * instead of two vputs. 
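The readdir loop above advances strictly by e2d_reclen, so a zero record length has to be treated as corruption (EIO) rather than looped on. A user-space walk over a single directory block is sketched below; the on-disk structure is a hand-rolled stand-in and the fs2h16/fs2h32 byte-order conversions are omitted, so it assumes a little-endian host.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Minimal stand-in for the on-disk ext2 directory entry header. */
struct e2dirent {
    uint32_t ino;
    uint16_t reclen;
    uint8_t  namlen;
    uint8_t  type;
    char     name[];
};

/* Walk the entries in one block; returns -1 on a corrupt (zero) reclen. */
static int
walk_block(const unsigned char *blk, size_t bsize)
{
    size_t off = 0;

    while (off < bsize) {
        const struct e2dirent *de = (const void *)(blk + off);

        if (de->reclen == 0)
            return -1;                  /* would otherwise loop forever */
        if (de->ino != 0)
            printf("%6u %.*s\n", (unsigned)de->ino, de->namlen, de->name);
        off += de->reclen;              /* advance by record length */
    }
    return 0;
}

int
main(void)
{
    /* One 32-byte toy "block": a single entry named "sbin" padded to the end. */
    unsigned char blk[32];
    struct e2dirent *de = (void *)blk;

    memset(blk, 0, sizeof(blk));
    de->ino = 11;
    de->reclen = sizeof(blk);
    de->namlen = 4;
    memcpy(de->name, "sbin", 4);
    return walk_block(blk, sizeof(blk)) == 0 ? 0 : 1;
}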
+ * + * Overall outline of ext2fs_lookup: + * + * check accessibility of directory + * look for name in cache, if found, then if at end of path + * and deleting or creating, drop it, else return name + * search for name in directory, to found or notfound + * notfound: + * if creating, return locked directory, leaving info on available slots + * else return error + * found: + * if at end of path and deleting, return information to allow delete + * if at end of path and rewriting (RENAME and LOCKPARENT), lock target + * inode and return info to allow rewrite + * if not at end, add name to cache; if at end and neither creating + * nor deleting, add name to cache + */ +int +ext2fs_lookup(void *v) +{ + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap = v; + struct vnode *vdp = ap->a_dvp; /* vnode for directory being searched */ + struct inode *dp = VTOI(vdp); /* inode for directory being searched */ + struct buf *bp; /* a buffer of directory entries */ + struct ext2fs_direct *ep; /* the current directory entry */ + int entryoffsetinblock; /* offset of ep in bp's buffer */ + enum {NONE, COMPACT, FOUND} slotstatus; + doff_t slotoffset; /* offset of area with free space */ + int slotsize; /* size of area at slotoffset */ + int slotfreespace; /* amount of space free in slot */ + int slotneeded; /* size of the entry we're seeking */ + int numdirpasses; /* strategy for directory search */ + doff_t endsearch; /* offset to end directory search */ + doff_t prevoff; /* prev entry dp->i_offset */ + struct vnode *pdp; /* saved dp during symlink work */ + struct vnode *tdp; /* returned by VFS_VGET */ + doff_t enduseful; /* pointer past last used dir slot */ + u_long bmask; /* block offset mask */ + int namlen, error; + struct vnode **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + kauth_cred_t cred = cnp->cn_cred; + int flags; + int nameiop = cnp->cn_nameiop; + struct ufsmount *ump = dp->i_ump; + int dirblksiz = ump->um_dirblksiz; + ino_t foundino; + struct ufs_lookup_results *results; + + flags = cnp->cn_flags; + + bp = NULL; + slotoffset = -1; + *vpp = NULL; + + /* + * Produce the auxiliary lookup results into i_crap. Increment + * its serial number so elsewhere we can tell if we're using + * stale results. This should not be done this way. XXX. + */ + results = &dp->i_crap; + dp->i_crapcounter++; + + /* + * Check accessiblity of directory. + */ + if ((error = VOP_ACCESS(vdp, VEXEC, cred)) != 0) + return (error); + + if ((flags & ISLASTCN) && (vdp->v_mount->mnt_flag & MNT_RDONLY) && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) + return (EROFS); + + /* + * We now have a segment name to search for, and a directory to search. + * + * Before tediously performing a linear scan of the directory, + * check the name cache to see if the directory/name pair + * we are looking for is known already. + */ + if ((error = cache_lookup(vdp, vpp, cnp)) >= 0) + return (error); + + /* + * Suppress search for slots unless creating + * file and at end of pathname, in which case + * we watch for a place to put the new file in + * case it doesn't already exist. + */ + slotstatus = FOUND; + slotfreespace = slotsize = slotneeded = 0; + if ((nameiop == CREATE || nameiop == RENAME) && + (flags & ISLASTCN)) { + slotstatus = NONE; + slotneeded = EXT2FS_DIRSIZ(cnp->cn_namelen); + } + + /* + * If there is cached information on a previous search of + * this directory, pick up where we last left off. 
+ * We cache only lookups as these are the most common + * and have the greatest payoff. Caching CREATE has little + * benefit as it usually must search the entire directory + * to determine that the entry does not exist. Caching the + * location of the last DELETE or RENAME has not reduced + * profiling time and hence has been removed in the interest + * of simplicity. + */ + bmask = vdp->v_mount->mnt_stat.f_iosize - 1; + if (nameiop != LOOKUP || results->ulr_diroff == 0 || + results->ulr_diroff >= ext2fs_size(dp)) { + entryoffsetinblock = 0; + results->ulr_offset = 0; + numdirpasses = 1; + } else { + results->ulr_offset = results->ulr_diroff; + if ((entryoffsetinblock = results->ulr_offset & bmask) && + (error = ext2fs_blkatoff(vdp, (off_t)results->ulr_offset, NULL, &bp))) + return (error); + numdirpasses = 2; + nchstats.ncs_2passes++; + } + prevoff = results->ulr_offset; + endsearch = roundup(ext2fs_size(dp), dirblksiz); + enduseful = 0; + +searchloop: + while (results->ulr_offset < endsearch) { + if (curcpu()->ci_schedstate.spc_flags & SPCF_SHOULDYIELD) + preempt(); + /* + * If necessary, get the next directory block. + */ + if ((results->ulr_offset & bmask) == 0) { + if (bp != NULL) + brelse(bp, 0); + error = ext2fs_blkatoff(vdp, (off_t)results->ulr_offset, NULL, + &bp); + if (error != 0) + return (error); + entryoffsetinblock = 0; + } + /* + * If still looking for a slot, and at a dirblksize + * boundary, have to start looking for free space again. + */ + if (slotstatus == NONE && + (entryoffsetinblock & (dirblksiz - 1)) == 0) { + slotoffset = -1; + slotfreespace = 0; + } + /* + * Get pointer to next entry. + * Full validation checks are slow, so we only check + * enough to insure forward progress through the + * directory. Complete checks can be run by patching + * "dirchk" to be true. + */ + KASSERT(bp != NULL); + ep = (struct ext2fs_direct *) + ((char *)bp->b_data + entryoffsetinblock); + if (ep->e2d_reclen == 0 || + (dirchk && + ext2fs_dirbadentry(vdp, ep, entryoffsetinblock))) { + int i; + + ufs_dirbad(dp, results->ulr_offset, "mangled entry"); + i = dirblksiz - (entryoffsetinblock & (dirblksiz - 1)); + results->ulr_offset += i; + entryoffsetinblock += i; + continue; + } + + /* + * If an appropriate sized slot has not yet been found, + * check to see if one is available. Also accumulate space + * in the current block so that we can determine if + * compaction is viable. + */ + if (slotstatus != FOUND) { + int size = fs2h16(ep->e2d_reclen); + + if (ep->e2d_ino != 0) + size -= EXT2FS_DIRSIZ(ep->e2d_namlen); + if (size > 0) { + if (size >= slotneeded) { + slotstatus = FOUND; + slotoffset = results->ulr_offset; + slotsize = fs2h16(ep->e2d_reclen); + } else if (slotstatus == NONE) { + slotfreespace += size; + if (slotoffset == -1) + slotoffset = results->ulr_offset; + if (slotfreespace >= slotneeded) { + slotstatus = COMPACT; + slotsize = results->ulr_offset + + fs2h16(ep->e2d_reclen) - + slotoffset; + } + } + } + } + + /* + * Check for a name match. + */ + if (ep->e2d_ino) { + namlen = ep->e2d_namlen; + if (namlen == cnp->cn_namelen && + !memcmp(cnp->cn_nameptr, ep->e2d_name, + (unsigned)namlen)) { + /* + * Save directory entry's inode number and + * reclen in ndp->ni_ufs area, and release + * directory buffer. 
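The slot search in the loop that follows distinguishes FOUND (one record already has enough slack), COMPACT (adjacent records could be squeezed together to make room) and NONE. The slack of a live record is its reclen minus the space its own name needs. A sketch of that bookkeeping is below, with an illustrative EXT2_DIRSIZ helper and without the per-dirblock reset the kernel performs.

#include <stdio.h>

#define ROUNDUP4(x)     (((x) + 3) & ~3u)
#define EXT2_DIRSIZ(n)  (8 + ROUNDUP4(n))       /* illustrative, not the kernel macro */

struct rec { unsigned ino, reclen, namlen; };

int
main(void)
{
    /* Three consecutive records in one directory block (toy data). */
    struct rec recs[] = {
        { 12, 16, 5 },          /* live, no slack: 16 - 16 = 0    */
        {  0, 12, 3 },          /* deleted: whole record is spare  */
        { 37, 24, 4 },          /* live: 24 - 12 = 12 bytes spare  */
    };
    unsigned needed = EXT2_DIRSIZ(8);   /* entry we would like to insert */
    unsigned spare, freespace = 0;
    size_t i;

    for (i = 0; i < sizeof(recs) / sizeof(recs[0]); i++) {
        spare = recs[i].reclen;
        if (recs[i].ino != 0)
            spare -= EXT2_DIRSIZ(recs[i].namlen);
        if (spare >= needed) {
            printf("FOUND: record %zu alone has %u spare bytes\n", i, spare);
            return 0;
        }
        freespace += spare;             /* accumulate toward compaction */
        if (freespace >= needed) {
            printf("COMPACT: records 0..%zu free %u bytes together\n",
                i, freespace);
            return 0;
        }
    }
    printf("NONE: append at the end of the directory\n");
    return 0;
}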
+ */ + foundino = fs2h32(ep->e2d_ino); + results->ulr_reclen = fs2h16(ep->e2d_reclen); + goto found; + } + } + prevoff = results->ulr_offset; + results->ulr_offset += fs2h16(ep->e2d_reclen); + entryoffsetinblock += fs2h16(ep->e2d_reclen); + if (ep->e2d_ino) + enduseful = results->ulr_offset; + } +/* notfound: */ + /* + * If we started in the middle of the directory and failed + * to find our target, we must check the beginning as well. + */ + if (numdirpasses == 2) { + numdirpasses--; + results->ulr_offset = 0; + endsearch = results->ulr_diroff; + goto searchloop; + } + if (bp != NULL) + brelse(bp, 0); + /* + * If creating, and at end of pathname and current + * directory has not been removed, then can consider + * allowing file to be created. + */ + if ((nameiop == CREATE || nameiop == RENAME) && + (flags & ISLASTCN) && dp->i_e2fs_nlink != 0) { + /* + * Access for write is interpreted as allowing + * creation of files in the directory. + */ + error = VOP_ACCESS(vdp, VWRITE, cred); + if (error) + return (error); + /* + * Return an indication of where the new directory + * entry should be put. If we didn't find a slot, + * then set results->ulr_count to 0 indicating + * that the new slot belongs at the end of the + * directory. If we found a slot, then the new entry + * can be put in the range from results->ulr_offset to + * results->ulr_offset + results->ulr_count. + */ + if (slotstatus == NONE) { + results->ulr_offset = roundup(ext2fs_size(dp), dirblksiz); + results->ulr_count = 0; + enduseful = results->ulr_offset; + } else { + results->ulr_offset = slotoffset; + results->ulr_count = slotsize; + if (enduseful < slotoffset + slotsize) + enduseful = slotoffset + slotsize; + } + results->ulr_endoff = roundup(enduseful, dirblksiz); +#if 0 + dp->i_flag |= IN_CHANGE | IN_UPDATE; +#endif + /* + * We return with the directory locked, so that + * the parameters we set up above will still be + * valid if we actually decide to do a direnter(). + * We return ni_vp == NULL to indicate that the entry + * does not currently exist; we leave a pointer to + * the (locked) directory inode in ndp->ni_dvp. + * + * NB - if the directory is unlocked, then this + * information cannot be used. + */ + return (EJUSTRETURN); + } + /* + * Insert name into cache (as non-existent) if appropriate. + */ + if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE) + cache_enter(vdp, *vpp, cnp); + return (ENOENT); + +found: + if (numdirpasses == 2) + nchstats.ncs_pass2++; + /* + * Check that directory length properly reflects presence + * of this entry. + */ + if (results->ulr_offset + EXT2FS_DIRSIZ(ep->e2d_namlen) > ext2fs_size(dp)) { + ufs_dirbad(dp, results->ulr_offset, "i_size too small"); + error = ext2fs_setsize(dp, + results->ulr_offset + EXT2FS_DIRSIZ(ep->e2d_namlen)); + if (error) { + brelse(bp, 0); + return (error); + } + dp->i_flag |= IN_CHANGE | IN_UPDATE; + uvm_vnp_setsize(vdp, ext2fs_size(dp)); + } + brelse(bp, 0); + + /* + * Found component in pathname. + * If the final component of path name, save information + * in the cache as to where the entry was found. + */ + if ((flags & ISLASTCN) && nameiop == LOOKUP) + results->ulr_diroff = results->ulr_offset &~ (dirblksiz - 1); + + /* + * If deleting, and at end of pathname, return + * parameters which can be used to remove file. + * Lock the inode, being careful with ".". + */ + if (nameiop == DELETE && (flags & ISLASTCN)) { + /* + * Write access to directory required to delete files. 
+ */ + if ((error = VOP_ACCESS(vdp, VWRITE, cred)) != 0) + return (error); + /* + * Return pointer to current entry in results->ulr_offset, + * and distance past previous entry (if there + * is a previous entry in this block) in results->ulr_count. + * Save directory inode pointer in ndp->ni_dvp for dirremove(). + */ + if ((results->ulr_offset & (dirblksiz - 1)) == 0) + results->ulr_count = 0; + else + results->ulr_count = results->ulr_offset - prevoff; + if (dp->i_number == foundino) { + vref(vdp); + *vpp = vdp; + return (0); + } + if (flags & ISDOTDOT) + VOP_UNLOCK(vdp); /* race to get the inode */ + error = VFS_VGET(vdp->v_mount, foundino, &tdp); + if (flags & ISDOTDOT) + vn_lock(vdp, LK_EXCLUSIVE | LK_RETRY); + if (error) + return (error); + /* + * If directory is "sticky", then user must own + * the directory, or the file in it, else she + * may not delete it (unless she's root). This + * implements append-only directories. + */ + if ((dp->i_e2fs_mode & ISVTX) && + kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL) && + kauth_cred_geteuid(cred) != dp->i_uid && + VTOI(tdp)->i_uid != kauth_cred_geteuid(cred)) { + vput(tdp); + return (EPERM); + } + *vpp = tdp; + return (0); + } + + /* + * If rewriting (RENAME), return the inode and the + * information required to rewrite the present directory + * Must get inode of directory entry to verify it's a + * regular file, or empty directory. + */ + if (nameiop == RENAME && (flags & ISLASTCN)) { + error = VOP_ACCESS(vdp, VWRITE, cred); + if (error) + return (error); + /* + * Careful about locking second inode. + * This can only occur if the target is ".". + */ + if (dp->i_number == foundino) + return (EISDIR); + if (flags & ISDOTDOT) + VOP_UNLOCK(vdp); /* race to get the inode */ + error = VFS_VGET(vdp->v_mount, foundino, &tdp); + if (flags & ISDOTDOT) + vn_lock(vdp, LK_EXCLUSIVE | LK_RETRY); + if (error) + return (error); + *vpp = tdp; + return (0); + } + + /* + * Step through the translation in the name. We do not `vput' the + * directory because we may need it again if a symbolic link + * is relative to the current directory. Instead we save it + * unlocked as "pdp". We must get the target inode before unlocking + * the directory to insure that the inode will not be removed + * before we get it. We prevent deadlock by always fetching + * inodes from the root, moving down the directory tree. Thus + * when following backward pointers ".." we must unlock the + * parent directory before getting the requested directory. + * There is a potential race condition here if both the current + * and parent directories are removed before the VFS_VGET for the + * inode associated with ".." returns. We hope that this occurs + * infrequently since we cannot avoid this race condition without + * implementing a sophisticated deadlock detection algorithm. + * Note also that this simple deadlock detection scheme will not + * work if the file system has any hard links other than ".." + * that point backwards in the directory structure. + */ + pdp = vdp; + if (flags & ISDOTDOT) { + VOP_UNLOCK(pdp); /* race to get the inode */ + error = VFS_VGET(vdp->v_mount, foundino, &tdp); + vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY); + if (error) { + return (error); + } + *vpp = tdp; + } else if (dp->i_number == foundino) { + vref(vdp); /* we want ourself, ie "." */ + *vpp = vdp; + } else { + error = VFS_VGET(vdp->v_mount, foundino, &tdp); + if (error) + return (error); + *vpp = tdp; + } + + /* + * Insert name into cache if appropriate. 
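The sticky-directory rule applied in the DELETE case above reduces to a simple predicate: in a sticky directory only root, the directory owner, or the file owner may remove an entry. A hedged user-space sketch with plain numeric uids, not the kauth-based kernel check:

#include <stdbool.h>
#include <stdio.h>

/*
 * May "uid" remove a file owned by "file_uid" from a sticky directory
 * owned by "dir_uid"?
 */
static bool
sticky_delete_ok(unsigned uid, unsigned dir_uid, unsigned file_uid)
{
    if (uid == 0)
        return true;                    /* root may always delete */
    return uid == dir_uid || uid == file_uid;
}

int
main(void)
{
    printf("%d\n", sticky_delete_ok(1000, 0, 1000));    /* own file: 1 */
    printf("%d\n", sticky_delete_ok(1000, 0, 1001));    /* someone else's: 0 */
    return 0;
}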
+ */ + if (cnp->cn_flags & MAKEENTRY) + cache_enter(vdp, *vpp, cnp); + return (0); +} + +/* + * Do consistency checking on a directory entry: + * record length must be multiple of 4 + * entry must fit in rest of its dirblksize block + * record must be large enough to contain entry + * name is not longer than EXT2FS_MAXNAMLEN + * name must be as long as advertised, and null terminated + */ +/* + * changed so that it confirms to ext2fs_check_dir_entry + */ +static int +ext2fs_dirbadentry(struct vnode *dp, struct ext2fs_direct *de, + int entryoffsetinblock) +{ + struct ufsmount *ump = VFSTOUFS(dp->v_mount); + int dirblksiz = ump->um_dirblksiz; + + const char *error_msg = NULL; + int reclen = fs2h16(de->e2d_reclen); + int namlen = de->e2d_namlen; + + if (reclen < EXT2FS_DIRSIZ(1)) /* e2d_namlen = 1 */ + error_msg = "rec_len is smaller than minimal"; + else if (reclen % 4 != 0) + error_msg = "rec_len % 4 != 0"; + else if (namlen > EXT2FS_MAXNAMLEN) + error_msg = "namlen > EXT2FS_MAXNAMLEN"; + else if (reclen < EXT2FS_DIRSIZ(namlen)) + error_msg = "reclen is too small for name_len"; + else if (entryoffsetinblock + reclen > dirblksiz) + error_msg = "directory entry across blocks"; + else if (fs2h32(de->e2d_ino) > + VTOI(dp)->i_e2fs->e2fs.e2fs_icount) + error_msg = "inode out of bounds"; + + if (error_msg != NULL) { + printf( "bad directory entry: %s\n" + "offset=%d, inode=%lu, rec_len=%d, name_len=%d \n", + error_msg, entryoffsetinblock, + (unsigned long) fs2h32(de->e2d_ino), + reclen, namlen); + panic("ext2fs_dirbadentry"); + } + return error_msg == NULL ? 0 : 1; +} + +/* + * Write a directory entry after a call to namei, using the parameters + * that it left in nameidata. The argument ip is the inode which the new + * directory entry will refer to. Dvp is a pointer to the directory to + * be written, which was left locked by namei. Remaining parameters + * (ulr_offset, ulr_count) indicate how the space for the new + * entry is to be obtained. + */ +int +ext2fs_direnter(struct inode *ip, struct vnode *dvp, + const struct ufs_lookup_results *ulr, + struct componentname *cnp) +{ + struct ext2fs_direct *ep, *nep; + struct inode *dp; + struct buf *bp; + struct ext2fs_direct newdir; + struct iovec aiov; + struct uio auio; + u_int dsize; + int error, loc, newentrysize, spacefree; + char *dirbuf; + struct ufsmount *ump = VFSTOUFS(dvp->v_mount); + int dirblksiz = ump->um_dirblksiz; + + dp = VTOI(dvp); + + newdir.e2d_ino = h2fs32(ip->i_number); + newdir.e2d_namlen = cnp->cn_namelen; + if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0 && + (ip->i_e2fs->e2fs.e2fs_features_incompat & EXT2F_INCOMPAT_FTYPE)) { + newdir.e2d_type = inot2ext2dt(IFTODT(ip->i_e2fs_mode)); + } else { + newdir.e2d_type = 0; + } + memcpy(newdir.e2d_name, cnp->cn_nameptr, (unsigned)cnp->cn_namelen + 1); + newentrysize = EXT2FS_DIRSIZ(cnp->cn_namelen); + if (ulr->ulr_count == 0) { + /* + * If ulr_count is 0, then namei could find no + * space in the directory. Here, ulr_offset will + * be on a directory block boundary and we will write the + * new entry into a fresh block. 
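The consistency rules listed above for ext2fs_dirbadentry are straightforward range checks on the record header. The user-space version below reports the first violated rule instead of panicking; EXT2_MAXNAMLEN and the size helper are assumed values, not the kernel definitions.

#include <stdio.h>

#define ROUNDUP4(x)     (((x) + 3) & ~3u)
#define EXT2_DIRSIZ(n)  (8 + ROUNDUP4(n))
#define EXT2_MAXNAMLEN  255             /* assumed limit */

/* Return NULL if the entry looks sane, otherwise a short reason string. */
static const char *
dirent_check(unsigned reclen, unsigned namlen, unsigned ino,
    unsigned offset_in_block, unsigned dirblksiz, unsigned inode_count)
{
    if (reclen < EXT2_DIRSIZ(1))
        return "rec_len is smaller than minimal";
    if (reclen % 4 != 0)
        return "rec_len % 4 != 0";
    if (namlen > EXT2_MAXNAMLEN)
        return "namlen > EXT2_MAXNAMLEN";
    if (reclen < EXT2_DIRSIZ(namlen))
        return "reclen is too small for name_len";
    if (offset_in_block + reclen > dirblksiz)
        return "directory entry across blocks";
    if (ino > inode_count)
        return "inode out of bounds";
    return NULL;
}

int
main(void)
{
    const char *why = dirent_check(14, 4, 11, 0, 1024, 8192);

    printf("%s\n", why ? why : "ok");   /* prints "rec_len % 4 != 0" */
    return 0;
}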
+ */ + if (ulr->ulr_offset & (dirblksiz - 1)) + panic("ext2fs_direnter: newblk"); + auio.uio_offset = ulr->ulr_offset; + newdir.e2d_reclen = h2fs16(dirblksiz); + auio.uio_resid = newentrysize; + aiov.iov_len = newentrysize; + aiov.iov_base = (void *)&newdir; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_WRITE; + UIO_SETUP_SYSSPACE(&auio); + error = VOP_WRITE(dvp, &auio, IO_SYNC, cnp->cn_cred); + if (dirblksiz > dvp->v_mount->mnt_stat.f_bsize) + /* XXX should grow with balloc() */ + panic("ext2fs_direnter: frag size"); + else if (!error) { + error = ext2fs_setsize(dp, + roundup(ext2fs_size(dp), dirblksiz)); + if (error) + return (error); + dp->i_flag |= IN_CHANGE; + uvm_vnp_setsize(dvp, ext2fs_size(dp)); + } + return (error); + } + + /* + * If ulr_count is non-zero, then namei found space + * for the new entry in the range ulr_offset to + * ulr_offset + ulr_count in the directory. + * To use this space, we may have to compact the entries located + * there, by copying them together towards the beginning of the + * block, leaving the free space in one usable chunk at the end. + */ + + /* + * Get the block containing the space for the new directory entry. + */ + if ((error = ext2fs_blkatoff(dvp, (off_t)ulr->ulr_offset, &dirbuf, &bp)) != 0) + return (error); + /* + * Find space for the new entry. In the simple case, the entry at + * offset base will have the space. If it does not, then namei + * arranged that compacting the region ulr_offset to + * ulr_offset + ulr_count would yield the + * space. + */ + ep = (struct ext2fs_direct *)dirbuf; + dsize = EXT2FS_DIRSIZ(ep->e2d_namlen); + spacefree = fs2h16(ep->e2d_reclen) - dsize; + for (loc = fs2h16(ep->e2d_reclen); loc < ulr->ulr_count; ) { + nep = (struct ext2fs_direct *)(dirbuf + loc); + if (ep->e2d_ino) { + /* trim the existing slot */ + ep->e2d_reclen = h2fs16(dsize); + ep = (struct ext2fs_direct *)((char *)ep + dsize); + } else { + /* overwrite; nothing there; header is ours */ + spacefree += dsize; + } + dsize = EXT2FS_DIRSIZ(nep->e2d_namlen); + spacefree += fs2h16(nep->e2d_reclen) - dsize; + loc += fs2h16(nep->e2d_reclen); + memcpy((void *)ep, (void *)nep, dsize); + } + /* + * Update the pointer fields in the previous entry (if any), + * copy in the new entry, and write out the block. + */ + if (ep->e2d_ino == 0) { +#ifdef DIAGNOSTIC + if (spacefree + dsize < newentrysize) + panic("ext2fs_direnter: compact1"); +#endif + newdir.e2d_reclen = h2fs16(spacefree + dsize); + } else { +#ifdef DIAGNOSTIC + if (spacefree < newentrysize) { + printf("ext2fs_direnter: compact2 %u %u", + (u_int)spacefree, (u_int)newentrysize); + panic("ext2fs_direnter: compact2"); + } +#endif + newdir.e2d_reclen = h2fs16(spacefree); + ep->e2d_reclen = h2fs16(dsize); + ep = (struct ext2fs_direct *)((char *)ep + dsize); + } + memcpy((void *)ep, (void *)&newdir, (u_int)newentrysize); + error = VOP_BWRITE(bp->b_vp, bp); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + if (!error && ulr->ulr_endoff && ulr->ulr_endoff < ext2fs_size(dp)) + error = ext2fs_truncate(dvp, (off_t)ulr->ulr_endoff, IO_SYNC, + cnp->cn_cred); + return (error); +} + +/* + * Remove a directory entry after a call to namei, using + * the auxiliary results it provided. The entry + * ulr_offset contains the offset into the directory of the + * entry to be eliminated. The ulr_count field contains the + * size of the previous record in the directory. If this + * is 0, the first entry is being deleted, so we need only + * zero the inode number to mark the entry as free. 
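When ulr_count is non-zero, direnter packs the live records in the region down to their real size and gives the recovered bytes to the new entry. The sketch below shows only that size bookkeeping; the in-place copying of records that the kernel loop performs is elided, and the size helper is again an illustrative stand-in.

#include <stdio.h>

#define ROUNDUP4(x)     (((x) + 3) & ~3u)
#define EXT2_DIRSIZ(n)  (8 + ROUNDUP4(n))

struct rec { unsigned ino, reclen, namlen; };

int
main(void)
{
    /*
     * Region found by lookup: one live entry with lots of slack followed
     * by one deleted entry.  Compaction trims the live entry to its real
     * size and hands all remaining bytes to the new record.
     */
    struct rec region[] = { { 12, 32, 3 }, { 0, 16, 6 } };
    unsigned count = 48;                /* ulr_count: bytes in the region */
    unsigned used = 0, spacefree = 0;
    size_t i;

    for (i = 0; i < sizeof(region) / sizeof(region[0]); i++) {
        unsigned dsize = EXT2_DIRSIZ(region[i].namlen);

        if (region[i].ino != 0) {
            spacefree += region[i].reclen - dsize;
            region[i].reclen = dsize;   /* trim the kept entry */
            used += dsize;
        } else {
            spacefree += region[i].reclen;      /* whole record is free */
        }
    }
    printf("kept entries use %u bytes, new entry gets reclen %u of %u\n",
        used, spacefree, count);
    return 0;
}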
If the + * entry is not the first in the directory, we must reclaim + * the space of the now empty record by adding the record size + * to the size of the previous entry. + */ +int +ext2fs_dirremove(struct vnode *dvp, const struct ufs_lookup_results *ulr, + struct componentname *cnp) +{ + struct inode *dp; + struct ext2fs_direct *ep; + struct buf *bp; + int error; + + dp = VTOI(dvp); + + if (ulr->ulr_count == 0) { + /* + * First entry in block: set d_ino to zero. + */ + error = ext2fs_blkatoff(dvp, (off_t)ulr->ulr_offset, + (void *)&ep, &bp); + if (error != 0) + return (error); + ep->e2d_ino = 0; + error = VOP_BWRITE(bp->b_vp, bp); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + return (error); + } + /* + * Collapse new free space into previous entry. + */ + error = ext2fs_blkatoff(dvp, (off_t)(ulr->ulr_offset - ulr->ulr_count), + (void *)&ep, &bp); + if (error != 0) + return (error); + ep->e2d_reclen = h2fs16(fs2h16(ep->e2d_reclen) + ulr->ulr_reclen); + error = VOP_BWRITE(bp->b_vp, bp); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + return (error); +} + +/* + * Rewrite an existing directory entry to point at the inode + * supplied. The parameters describing the directory entry are + * set up by a call to namei. + */ +int +ext2fs_dirrewrite(struct inode *dp, const struct ufs_lookup_results *ulr, + struct inode *ip, struct componentname *cnp) +{ + struct buf *bp; + struct ext2fs_direct *ep; + struct vnode *vdp = ITOV(dp); + int error; + + error = ext2fs_blkatoff(vdp, (off_t)ulr->ulr_offset, (void *)&ep, &bp); + if (error != 0) + return (error); + ep->e2d_ino = h2fs32(ip->i_number); + if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0 && + (ip->i_e2fs->e2fs.e2fs_features_incompat & EXT2F_INCOMPAT_FTYPE)) { + ep->e2d_type = inot2ext2dt(IFTODT(ip->i_e2fs_mode)); + } else { + ep->e2d_type = 0; + } + error = VOP_BWRITE(bp->b_vp, bp); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + return (error); +} + +/* + * Check if a directory is empty or not. + * Inode supplied must be locked. + * + * Using a struct dirtemplate here is not precisely + * what we want, but better than using a struct ext2fs_direct. + * + * NB: does not handle corrupted directories. + */ +int +ext2fs_dirempty(struct inode *ip, ino_t parentino, kauth_cred_t cred) +{ + off_t off; + struct ext2fs_dirtemplate dbuf; + struct ext2fs_direct *dp = (struct ext2fs_direct *)&dbuf; + int error, namlen; + size_t count; + +#define MINDIRSIZ (sizeof (struct ext2fs_dirtemplate) / 2) + + for (off = 0; off < ext2fs_size(ip); off += fs2h16(dp->e2d_reclen)) { + error = vn_rdwr(UIO_READ, ITOV(ip), (void *)dp, MINDIRSIZ, off, + UIO_SYSSPACE, IO_NODELOCKED, cred, &count, NULL); + /* + * Since we read MINDIRSIZ, residual must + * be 0 unless we're at end of file. + */ + if (error || count != 0) + return (0); + /* avoid infinite loops */ + if (dp->e2d_reclen == 0) + return (0); + /* skip empty entries */ + if (dp->e2d_ino == 0) + continue; + /* accept only "." and ".." */ + namlen = dp->e2d_namlen; + if (namlen > 2) + return (0); + if (dp->e2d_name[0] != '.') + return (0); + /* + * At this point namlen must be 1 or 2. + * 1 implies ".", 2 implies ".." if second + * char is also "." + */ + if (namlen == 1) + continue; + if (dp->e2d_name[1] == '.' && fs2h32(dp->e2d_ino) == parentino) + continue; + return (0); + } + return (1); +} + +/* + * Check if source directory is in the path of the target directory. + * Target is supplied locked, source is unlocked. + * The target is always vput before returning. 
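ext2fs_dirremove above has exactly two cases: the first record of a block merely gets its inode number cleared, while any other record is swallowed by its predecessor, whose reclen grows by the removed record's reclen. A toy illustration:

#include <stdio.h>
#include <stddef.h>

struct rec { unsigned ino, reclen; };

/*
 * Remove "de".  If it is the first record of its block there is no
 * predecessor to grow, so only the inode number is cleared; otherwise the
 * predecessor swallows the removed record's space.
 */
static void
remove_entry(struct rec *prev, struct rec *de)
{
    if (prev == NULL)
        de->ino = 0;
    else
        prev->reclen += de->reclen;
}

int
main(void)
{
    struct rec prev = { 12, 16 };
    struct rec gone = { 37, 24 };

    remove_entry(&prev, &gone);
    printf("prev reclen now %u\n", prev.reclen);        /* 16 + 24 = 40 */
    return 0;
}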
+ */ +int +ext2fs_checkpath(struct inode *source, struct inode *target, + kauth_cred_t cred) +{ + struct vnode *vp; + int error, rootino, namlen; + struct ext2fs_dirtemplate dirbuf; + uint32_t ino; + + vp = ITOV(target); + if (target->i_number == source->i_number) { + error = EEXIST; + goto out; + } + rootino = ROOTINO; + error = 0; + if (target->i_number == rootino) + goto out; + + for (;;) { + if (vp->v_type != VDIR) { + error = ENOTDIR; + break; + } + error = vn_rdwr(UIO_READ, vp, (void *)&dirbuf, + sizeof (struct ext2fs_dirtemplate), (off_t)0, + UIO_SYSSPACE, IO_NODELOCKED, cred, (size_t *)0, + NULL); + if (error != 0) + break; + namlen = dirbuf.dotdot_namlen; + if (namlen != 2 || + dirbuf.dotdot_name[0] != '.' || + dirbuf.dotdot_name[1] != '.') { + error = ENOTDIR; + break; + } + ino = fs2h32(dirbuf.dotdot_ino); + if (ino == source->i_number) { + error = EINVAL; + break; + } + if (ino == rootino) + break; + vput(vp); + error = VFS_VGET(vp->v_mount, ino, &vp); + if (error != 0) { + vp = NULL; + break; + } + } + +out: + if (error == ENOTDIR) { + printf("checkpath: .. not a directory\n"); + panic("checkpath"); + } + if (vp != NULL) + vput(vp); + return (error); +} diff --git a/sys/ufs/ext2fs/ext2fs_readwrite.c b/sys/ufs/ext2fs/ext2fs_readwrite.c new file mode 100644 index 000000000..0b6f8d617 --- /dev/null +++ b/sys/ufs/ext2fs/ext2fs_readwrite.c @@ -0,0 +1,392 @@ +/* $NetBSD: ext2fs_readwrite.c,v 1.58 2011/11/18 21:18:51 christos Exp $ */ + +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_readwrite.c 8.8 (Berkeley) 8/4/94 + * Modified for ext2fs by Manuel Bouyer. + */ + +/*- + * Copyright (c) 1997 Manuel Bouyer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @(#)ufs_readwrite.c 8.8 (Berkeley) 8/4/94 + * Modified for ext2fs by Manuel Bouyer. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ext2fs_readwrite.c,v 1.58 2011/11/18 21:18:51 christos Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + + +#define doclusterread 0 /* XXX underway */ +#define doclusterwrite 0 + +/* + * Vnode op for reading. + */ +/* ARGSUSED */ +int +ext2fs_read(void *v) +{ + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp; + struct inode *ip; + struct uio *uio; + struct m_ext2fs *fs; + struct buf *bp; + struct ufsmount *ump; + vsize_t bytelen; + daddr_t lbn, nextlbn; + off_t bytesinfile; + long size, xfersize, blkoffset; + int error; + + vp = ap->a_vp; + ip = VTOI(vp); + ump = ip->i_ump; + uio = ap->a_uio; + error = 0; + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_READ) + panic("%s: mode", "ext2fs_read"); + + if (vp->v_type == VLNK) { + if (ext2fs_size(ip) < ump->um_maxsymlinklen || + (ump->um_maxsymlinklen == 0 && ip->i_e2fs_nblock == 0)) + panic("%s: short symlink", "ext2fs_read"); + } else if (vp->v_type != VREG && vp->v_type != VDIR) + panic("%s: type %d", "ext2fs_read", vp->v_type); +#endif + fs = ip->i_e2fs; + if ((uint64_t)uio->uio_offset > ump->um_maxfilesize) + return (EFBIG); + if (uio->uio_resid == 0) + return (0); + if (uio->uio_offset >= ext2fs_size(ip)) + goto out; + + if (vp->v_type == VREG) { + const int advice = IO_ADV_DECODE(ap->a_ioflag); + + while (uio->uio_resid > 0) { + bytelen = MIN(ext2fs_size(ip) - uio->uio_offset, + uio->uio_resid); + if (bytelen == 0) + break; + + error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice, + UBC_READ | UBC_PARTIALOK | UBC_UNMAP_FLAG(vp)); + if (error) + break; + } + goto out; + } + + for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { + bytesinfile = ext2fs_size(ip) - uio->uio_offset; + if (bytesinfile <= 0) + break; + lbn = lblkno(fs, uio->uio_offset); + nextlbn = lbn + 1; + size = fs->e2fs_bsize; + blkoffset = blkoff(fs, uio->uio_offset); + xfersize = fs->e2fs_bsize - blkoffset; + if (uio->uio_resid < xfersize) + xfersize = uio->uio_resid; + if (bytesinfile < xfersize) + xfersize = bytesinfile; + + if (lblktosize(fs, nextlbn) >= ext2fs_size(ip)) + error = bread(vp, lbn, size, NOCRED, 0, &bp); + else { + int nextsize = fs->e2fs_bsize; + error = breadn(vp, lbn, + size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp); + } + if (error) + 
break; + + /* + * We should only get non-zero b_resid when an I/O error + * has occurred, which should cause us to break above. + * However, if the short read did not cause an error, + * then we want to ensure that we do not uiomove bad + * or uninitialized data. + */ + size -= bp->b_resid; + if (size < xfersize) { + if (size == 0) + break; + xfersize = size; + } + error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); + if (error) + break; + brelse(bp, 0); + } + if (bp != NULL) + brelse(bp, 0); + +out: + if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) { + ip->i_flag |= IN_ACCESS; + if ((ap->a_ioflag & IO_SYNC) == IO_SYNC) + error = ext2fs_update(vp, NULL, NULL, UPDATE_WAIT); + } + return (error); +} + +/* + * Vnode op for writing. + */ +int +ext2fs_write(void *v) +{ + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp; + struct uio *uio; + struct inode *ip; + struct m_ext2fs *fs; + struct buf *bp; + struct ufsmount *ump; + daddr_t lbn; + off_t osize; + int blkoffset, error, flags, ioflag, resid, xfersize; + vsize_t bytelen; + off_t oldoff = 0; /* XXX */ + bool async; + int extended = 0; + int advice; + + ioflag = ap->a_ioflag; + advice = IO_ADV_DECODE(ioflag); + uio = ap->a_uio; + vp = ap->a_vp; + ip = VTOI(vp); + ump = ip->i_ump; + error = 0; + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_WRITE) + panic("%s: mode", "ext2fs_write"); +#endif + + switch (vp->v_type) { + case VREG: + if (ioflag & IO_APPEND) + uio->uio_offset = ext2fs_size(ip); + if ((ip->i_e2fs_flags & EXT2_APPEND) && + uio->uio_offset != ext2fs_size(ip)) + return (EPERM); + /* FALLTHROUGH */ + case VLNK: + break; + case VDIR: + if ((ioflag & IO_SYNC) == 0) + panic("%s: nonsync dir write", "ext2fs_write"); + break; + default: + panic("%s: type", "ext2fs_write"); + } + + fs = ip->i_e2fs; + if (uio->uio_offset < 0 || + (uint64_t)uio->uio_offset + uio->uio_resid > ump->um_maxfilesize) + return (EFBIG); + if (uio->uio_resid == 0) + return (0); + + async = vp->v_mount->mnt_flag & MNT_ASYNC; + resid = uio->uio_resid; + osize = ext2fs_size(ip); + + if (vp->v_type == VREG) { + while (uio->uio_resid > 0) { + oldoff = uio->uio_offset; + blkoffset = blkoff(fs, uio->uio_offset); + bytelen = MIN(fs->e2fs_bsize - blkoffset, + uio->uio_resid); + + if (vp->v_size < oldoff + bytelen) { + uvm_vnp_setwritesize(vp, oldoff + bytelen); + } + error = ufs_balloc_range(vp, uio->uio_offset, + bytelen, ap->a_cred, 0); + if (error) + break; + error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice, + UBC_WRITE | UBC_UNMAP_FLAG(vp)); + if (error) + break; + + /* + * update UVM's notion of the size now that we've + * copied the data into the vnode's pages. + */ + + if (vp->v_size < uio->uio_offset) { + uvm_vnp_setsize(vp, uio->uio_offset); + extended = 1; + } + + /* + * flush what we just wrote if necessary. + * XXXUBC simplistic async flushing. + */ + + if (!async && oldoff >> 16 != uio->uio_offset >> 16) { + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, (oldoff >> 16) << 16, + (uio->uio_offset >> 16) << 16, PGO_CLEANIT); + } + } + if (error == 0 && ioflag & IO_SYNC) { + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, trunc_page(oldoff), + round_page(blkroundup(fs, uio->uio_offset)), + PGO_CLEANIT | PGO_SYNCIO); + } + + goto out; + } + + flags = ioflag & IO_SYNC ? 
B_SYNC : 0; + for (error = 0; uio->uio_resid > 0;) { + lbn = lblkno(fs, uio->uio_offset); + blkoffset = blkoff(fs, uio->uio_offset); + xfersize = MIN(fs->e2fs_bsize - blkoffset, uio->uio_resid); + if (xfersize < fs->e2fs_bsize) + flags |= B_CLRBUF; + else + flags &= ~B_CLRBUF; + error = ext2fs_balloc(ip, + lbn, blkoffset + xfersize, ap->a_cred, &bp, flags); + if (error) + break; + if (ext2fs_size(ip) < uio->uio_offset + xfersize) { + error = ext2fs_setsize(ip, uio->uio_offset + xfersize); + if (error) + break; + } + error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); + + /* + * update UVM's notion of the size now that we've + * copied the data into the vnode's pages. + */ + + if (vp->v_size < uio->uio_offset) { + uvm_vnp_setsize(vp, uio->uio_offset); + extended = 1; + } + + if (ioflag & IO_SYNC) + (void)bwrite(bp); + else if (xfersize + blkoffset == fs->e2fs_bsize) + bawrite(bp); + else + bdwrite(bp); + if (error || xfersize == 0) + break; + } + + /* + * If we successfully wrote any data, and we are not the superuser + * we clear the setuid and setgid bits as a precaution against + * tampering. + */ + +out: + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (vp->v_mount->mnt_flag & MNT_RELATIME) + ip->i_flag |= IN_ACCESS; + if (resid > uio->uio_resid && ap->a_cred && + kauth_authorize_generic(ap->a_cred, KAUTH_GENERIC_ISSUSER, NULL)) + ip->i_e2fs_mode &= ~(ISUID | ISGID); + if (resid > uio->uio_resid) + VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0)); + if (error) { + (void) ext2fs_truncate(vp, osize, ioflag & IO_SYNC, ap->a_cred); + uio->uio_offset -= resid - uio->uio_resid; + uio->uio_resid = resid; + } else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC) + error = ext2fs_update(vp, NULL, NULL, UPDATE_WAIT); + KASSERT(vp->v_size == ext2fs_size(ip)); + return (error); +} diff --git a/sys/ufs/ext2fs/ext2fs_subr.c b/sys/ufs/ext2fs/ext2fs_subr.c new file mode 100644 index 000000000..64f4c9f2c --- /dev/null +++ b/sys/ufs/ext2fs/ext2fs_subr.c @@ -0,0 +1,137 @@ +/* $NetBSD: ext2fs_subr.c,v 1.27 2009/10/19 18:41:17 bouyer Exp $ */ + +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_subr.c 8.2 (Berkeley) 9/21/93 + * Modified for ext2fs by Manuel Bouyer. + */ + +/* + * Copyright (c) 1997 Manuel Bouyer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @(#)ffs_subr.c 8.2 (Berkeley) 9/21/93 + * Modified for ext2fs by Manuel Bouyer. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ext2fs_subr.c,v 1.27 2009/10/19 18:41:17 bouyer Exp $"); + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * Return buffer with the contents of block "offset" from the beginning of + * directory "ip". If "res" is non-zero, fill it in with a pointer to the + * remaining space in the directory. 
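ext2fs_blkatoff, defined further below, only has to split a byte offset into a logical block number and an offset within that block; with a power-of-two block size the lblkno()/blkoff() macros reduce to a shift and a mask. A sketch assuming a 4096-byte block:

#include <stdio.h>

#define BSIZE   4096
#define BSHIFT  12                      /* log2(BSIZE) */
#define BMASK   (BSIZE - 1)

int
main(void)
{
    long long offset = 1234567;         /* byte offset into the directory */
    long long lbn = offset >> BSHIFT;   /* logical block holding it */
    long long off = offset & BMASK;     /* position inside that block */

    printf("offset %lld -> block %lld, byte %lld\n", offset, lbn, off);
    return 0;
}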
+ */ +int +ext2fs_blkatoff(struct vnode *vp, off_t offset, char **res, struct buf **bpp) +{ + struct inode *ip; + struct m_ext2fs *fs; + struct buf *bp; + daddr_t lbn; + int error; + + ip = VTOI(vp); + fs = ip->i_e2fs; + lbn = lblkno(fs, offset); + + *bpp = NULL; + if ((error = bread(vp, lbn, fs->e2fs_bsize, NOCRED, 0, &bp)) != 0) { + brelse(bp, 0); + return (error); + } + if (res) + *res = (char *)bp->b_data + blkoff(fs, offset); + *bpp = bp; + return (0); +} + +void +ext2fs_itimes(struct inode *ip, const struct timespec *acc, + const struct timespec *mod, const struct timespec *cre) +{ + struct timespec now; + + if (!(ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY))) { + return; + } + + vfs_timestamp(&now); + if (ip->i_flag & IN_ACCESS) { + if (acc == NULL) + acc = &now; + ip->i_e2fs_atime = acc->tv_sec; + } + if (ip->i_flag & (IN_UPDATE | IN_MODIFY)) { + if (mod == NULL) + mod = &now; + ip->i_e2fs_mtime = mod->tv_sec; + ip->i_modrev++; + } + if (ip->i_flag & (IN_CHANGE | IN_MODIFY)) { + if (cre == NULL) + cre = &now; + ip->i_e2fs_ctime = cre->tv_sec; + } + if (ip->i_flag & (IN_ACCESS | IN_MODIFY)) + ip->i_flag |= IN_ACCESSED; + if (ip->i_flag & (IN_UPDATE | IN_CHANGE)) + ip->i_flag |= IN_MODIFIED; + ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY); +} diff --git a/sys/ufs/ext2fs/ext2fs_vfsops.c b/sys/ufs/ext2fs/ext2fs_vfsops.c new file mode 100644 index 000000000..76f6dd5a5 --- /dev/null +++ b/sys/ufs/ext2fs/ext2fs_vfsops.c @@ -0,0 +1,1266 @@ +/* $NetBSD: ext2fs_vfsops.c,v 1.162 2011/11/14 18:35:14 hannken Exp $ */ + +/* + * Copyright (c) 1989, 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_vfsops.c 8.14 (Berkeley) 11/28/94 + * Modified for ext2fs by Manuel Bouyer. + */ + +/* + * Copyright (c) 1997 Manuel Bouyer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @(#)ffs_vfsops.c 8.14 (Berkeley) 11/28/94 + * Modified for ext2fs by Manuel Bouyer. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ext2fs_vfsops.c,v 1.162 2011/11/14 18:35:14 hannken Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_compat_netbsd.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +MODULE(MODULE_CLASS_VFS, ext2fs, "ffs"); + +int ext2fs_sbupdate(struct ufsmount *, int); +static int ext2fs_checksb(struct ext2fs *, int); + +static struct sysctllog *ext2fs_sysctl_log; + +extern const struct vnodeopv_desc ext2fs_vnodeop_opv_desc; +extern const struct vnodeopv_desc ext2fs_specop_opv_desc; +extern const struct vnodeopv_desc ext2fs_fifoop_opv_desc; + +const struct vnodeopv_desc * const ext2fs_vnodeopv_descs[] = { + &ext2fs_vnodeop_opv_desc, + &ext2fs_specop_opv_desc, + &ext2fs_fifoop_opv_desc, + NULL, +}; + +struct vfsops ext2fs_vfsops = { + MOUNT_EXT2FS, + sizeof (struct ufs_args), + ext2fs_mount, + ufs_start, + ext2fs_unmount, + ufs_root, + ufs_quotactl, + ext2fs_statvfs, + ext2fs_sync, + ext2fs_vget, + ext2fs_fhtovp, + ext2fs_vptofh, + ext2fs_init, + ext2fs_reinit, + ext2fs_done, + ext2fs_mountroot, + (int (*)(struct mount *, struct vnode *, struct timespec *)) eopnotsupp, + vfs_stdextattrctl, + (void *)eopnotsupp, /* vfs_suspendctl */ + genfs_renamelock_enter, + genfs_renamelock_exit, + (void *)eopnotsupp, + ext2fs_vnodeopv_descs, + 0, + { NULL, NULL }, +}; + +static const struct genfs_ops ext2fs_genfsops = { + .gop_size = genfs_size, + .gop_alloc = ext2fs_gop_alloc, + .gop_write = genfs_gop_write, + .gop_markupdate = ufs_gop_markupdate, +}; + +static const struct ufs_ops ext2fs_ufsops = { + .uo_itimes = ext2fs_itimes, + .uo_update = ext2fs_update, + .uo_vfree = ext2fs_vfree, + .uo_unmark_vnode = (void (*)(vnode_t *))nullop, +}; + +/* Fill in the inode uid/gid from ext2 halves. 
*/ +void +ext2fs_set_inode_guid(struct inode *ip) +{ + + ip->i_gid = ip->i_e2fs_gid; + ip->i_uid = ip->i_e2fs_uid; + if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0) { + ip->i_gid |= ip->i_e2fs_gid_high << 16; + ip->i_uid |= ip->i_e2fs_uid_high << 16; + } +} + +static int +ext2fs_modcmd(modcmd_t cmd, void *arg) +{ + int error; + + switch (cmd) { + case MODULE_CMD_INIT: + error = vfs_attach(&ext2fs_vfsops); + if (error != 0) + break; + sysctl_createv(&ext2fs_sysctl_log, 0, NULL, NULL, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "vfs", NULL, + NULL, 0, NULL, 0, + CTL_VFS, CTL_EOL); + sysctl_createv(&ext2fs_sysctl_log, 0, NULL, NULL, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "ext2fs", + SYSCTL_DESCR("Linux EXT2FS file system"), + NULL, 0, NULL, 0, + CTL_VFS, 17, CTL_EOL); + /* + * XXX the "17" above could be dynamic, thereby eliminating + * one more instance of the "number to vfs" mapping problem, + * but "17" is the order as taken from sys/mount.h + */ + break; + case MODULE_CMD_FINI: + error = vfs_detach(&ext2fs_vfsops); + if (error != 0) + break; + sysctl_teardown(&ext2fs_sysctl_log); + break; + default: + error = ENOTTY; + break; + } + + return (error); +} + +/* + * XXX Same structure as FFS inodes? Should we share a common pool? + */ +struct pool ext2fs_inode_pool; +struct pool ext2fs_dinode_pool; + +extern u_long ext2gennumber; + +void +ext2fs_init(void) +{ + + pool_init(&ext2fs_inode_pool, sizeof(struct inode), 0, 0, 0, + "ext2fsinopl", &pool_allocator_nointr, IPL_NONE); + pool_init(&ext2fs_dinode_pool, sizeof(struct ext2fs_dinode), 0, 0, 0, + "ext2dinopl", &pool_allocator_nointr, IPL_NONE); + ufs_init(); +} + +void +ext2fs_reinit(void) +{ + ufs_reinit(); +} + +void +ext2fs_done(void) +{ + + ufs_done(); + pool_destroy(&ext2fs_inode_pool); + pool_destroy(&ext2fs_dinode_pool); +} + +/* + * Called by main() when ext2fs is going to be mounted as root. + * + * Name is updated by mount(8) after booting. + */ +#define ROOTNAME "root_device" + +int +ext2fs_mountroot(void) +{ + extern struct vnode *rootvp; + struct m_ext2fs *fs; + struct mount *mp; + struct ufsmount *ump; + int error; + + if (device_class(root_device) != DV_DISK) + return (ENODEV); + + if ((error = vfs_rootmountalloc(MOUNT_EXT2FS, "root_device", &mp))) { + vrele(rootvp); + return (error); + } + + if ((error = ext2fs_mountfs(rootvp, mp)) != 0) { + vfs_unbusy(mp, false, NULL); + vfs_destroy(mp); + return (error); + } + mutex_enter(&mountlist_lock); + CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mutex_exit(&mountlist_lock); + ump = VFSTOUFS(mp); + fs = ump->um_e2fs; + memset(fs->e2fs_fsmnt, 0, sizeof(fs->e2fs_fsmnt)); + (void) copystr(mp->mnt_stat.f_mntonname, fs->e2fs_fsmnt, + sizeof(fs->e2fs_fsmnt) - 1, 0); + if (fs->e2fs.e2fs_rev > E2FS_REV0) { + memset(fs->e2fs.e2fs_fsmnt, 0, sizeof(fs->e2fs.e2fs_fsmnt)); + (void) copystr(mp->mnt_stat.f_mntonname, fs->e2fs.e2fs_fsmnt, + sizeof(fs->e2fs.e2fs_fsmnt) - 1, 0); + } + (void)ext2fs_statvfs(mp, &mp->mnt_stat); + vfs_unbusy(mp, false, NULL); + setrootfstime((time_t)fs->e2fs.e2fs_wtime); + return (0); +} + +/* + * VFS Operations. 
+ * + * mount system call + */ +int +ext2fs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) +{ + struct lwp *l = curlwp; + struct vnode *devvp; + struct ufs_args *args = data; + struct ufsmount *ump = NULL; + struct m_ext2fs *fs; + size_t size; + int error = 0, flags, update; + mode_t accessmode; + + if (*data_len < sizeof *args) + return EINVAL; + + if (mp->mnt_flag & MNT_GETARGS) { + ump = VFSTOUFS(mp); + if (ump == NULL) + return EIO; + memset(args, 0, sizeof *args); + args->fspec = NULL; + *data_len = sizeof *args; + return 0; + } + + update = mp->mnt_flag & MNT_UPDATE; + + /* Check arguments */ + if (args->fspec != NULL) { + /* + * Look up the name and verify that it's sane. + */ + error = namei_simple_user(args->fspec, + NSM_FOLLOW_NOEMULROOT, &devvp); + if (error != 0) + return (error); + + if (!update) { + /* + * Be sure this is a valid block device + */ + if (devvp->v_type != VBLK) + error = ENOTBLK; + else if (bdevsw_lookup(devvp->v_rdev) == NULL) + error = ENXIO; + } else { + /* + * Be sure we're still naming the same device + * used for our initial mount + */ + ump = VFSTOUFS(mp); + if (devvp != ump->um_devvp) { + if (devvp->v_rdev != ump->um_devvp->v_rdev) + error = EINVAL; + else { + vrele(devvp); + devvp = ump->um_devvp; + vref(devvp); + } + } + } + } else { + if (!update) { + /* New mounts must have a filename for the device */ + return (EINVAL); + } else { + ump = VFSTOUFS(mp); + devvp = ump->um_devvp; + vref(devvp); + } + } + + /* + * If mount by non-root, then verify that user has necessary + * permissions on the device. + * + * Permission to update a mount is checked higher, so here we presume + * updating the mount is okay (for example, as far as securelevel goes) + * which leaves us with the normal check. + */ + if (error == 0) { + accessmode = VREAD; + if (update ? + (mp->mnt_iflag & IMNT_WANTRDWR) != 0 : + (mp->mnt_flag & MNT_RDONLY) == 0) + accessmode |= VWRITE; + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = genfs_can_mount(devvp, accessmode, l->l_cred); + VOP_UNLOCK(devvp); + } + + if (error) { + vrele(devvp); + return (error); + } + + if (!update) { + int xflags; + + if (mp->mnt_flag & MNT_RDONLY) + xflags = FREAD; + else + xflags = FREAD|FWRITE; + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_OPEN(devvp, xflags, FSCRED); + VOP_UNLOCK(devvp); + if (error) + goto fail; + error = ext2fs_mountfs(devvp, mp); + if (error) { + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + (void)VOP_CLOSE(devvp, xflags, NOCRED); + VOP_UNLOCK(devvp); + goto fail; + } + + ump = VFSTOUFS(mp); + fs = ump->um_e2fs; + } else { + /* + * Update the mount. + */ + + /* + * The initial mount got a reference on this + * device, so drop the one obtained via + * namei(), above. 
+ */ + vrele(devvp); + + ump = VFSTOUFS(mp); + fs = ump->um_e2fs; + if (fs->e2fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) { + /* + * Changing from r/w to r/o + */ + flags = WRITECLOSE; + if (mp->mnt_flag & MNT_FORCE) + flags |= FORCECLOSE; + error = ext2fs_flushfiles(mp, flags); + if (error == 0 && + ext2fs_cgupdate(ump, MNT_WAIT) == 0 && + (fs->e2fs.e2fs_state & E2FS_ERRORS) == 0) { + fs->e2fs.e2fs_state = E2FS_ISCLEAN; + (void) ext2fs_sbupdate(ump, MNT_WAIT); + } + if (error) + return (error); + fs->e2fs_ronly = 1; + } + + if (mp->mnt_flag & MNT_RELOAD) { + error = ext2fs_reload(mp, l->l_cred, l); + if (error) + return (error); + } + + if (fs->e2fs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR)) { + /* + * Changing from read-only to read/write + */ + fs->e2fs_ronly = 0; + if (fs->e2fs.e2fs_state == E2FS_ISCLEAN) + fs->e2fs.e2fs_state = 0; + else + fs->e2fs.e2fs_state = E2FS_ERRORS; + fs->e2fs_fmod = 1; + } + if (args->fspec == NULL) + return 0; + } + + error = set_statvfs_info(path, UIO_USERSPACE, args->fspec, + UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l); + (void) copystr(mp->mnt_stat.f_mntonname, fs->e2fs_fsmnt, + sizeof(fs->e2fs_fsmnt) - 1, &size); + memset(fs->e2fs_fsmnt + size, 0, sizeof(fs->e2fs_fsmnt) - size); + if (fs->e2fs.e2fs_rev > E2FS_REV0) { + (void) copystr(mp->mnt_stat.f_mntonname, fs->e2fs.e2fs_fsmnt, + sizeof(fs->e2fs.e2fs_fsmnt) - 1, &size); + memset(fs->e2fs.e2fs_fsmnt, 0, + sizeof(fs->e2fs.e2fs_fsmnt) - size); + } + if (fs->e2fs_fmod != 0) { /* XXX */ + fs->e2fs_fmod = 0; + if (fs->e2fs.e2fs_state == 0) + fs->e2fs.e2fs_wtime = time_second; + else + printf("%s: file system not clean; please fsck(8)\n", + mp->mnt_stat.f_mntfromname); + (void) ext2fs_cgupdate(ump, MNT_WAIT); + } + return (error); + +fail: + vrele(devvp); + return (error); +} + +/* + * Reload all incore data for a filesystem (used after running fsck on + * the root filesystem and finding things to fix). The filesystem must + * be mounted read-only. + * + * Things to do to update the mount: + * 1) invalidate all cached meta-data. + * 2) re-read superblock from disk. + * 3) re-read summary information from disk. + * 4) invalidate all inactive vnodes. + * 5) invalidate all cached file data. + * 6) re-read inode data for all active vnodes. + */ +int +ext2fs_reload(struct mount *mp, kauth_cred_t cred, struct lwp *l) +{ + struct vnode *vp, *mvp, *devvp; + struct inode *ip; + struct buf *bp; + struct m_ext2fs *fs; + struct ext2fs *newfs; + int i, error; + void *cp; + struct ufsmount *ump; + + if ((mp->mnt_flag & MNT_RDONLY) == 0) + return (EINVAL); + + ump = VFSTOUFS(mp); + /* + * Step 1: invalidate all cached meta-data. + */ + devvp = ump->um_devvp; + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = vinvalbuf(devvp, 0, cred, l, 0, 0); + VOP_UNLOCK(devvp); + if (error) + panic("ext2fs_reload: dirty1"); + /* + * Step 2: re-read superblock from disk. 
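+ * The superblock is re-read from its fixed location (SBLOCK) and validated
+ * with ext2fs_checksb() before the in-core copy is overwritten.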
+ */ + error = bread(devvp, SBLOCK, SBSIZE, NOCRED, 0, &bp); + if (error) { + brelse(bp, 0); + return (error); + } + newfs = (struct ext2fs *)bp->b_data; + error = ext2fs_checksb(newfs, (mp->mnt_flag & MNT_RDONLY) != 0); + if (error) { + brelse(bp, 0); + return (error); + } + + fs = ump->um_e2fs; + /* + * copy in new superblock, and compute in-memory values + */ + e2fs_sbload(newfs, &fs->e2fs); + fs->e2fs_ncg = + howmany(fs->e2fs.e2fs_bcount - fs->e2fs.e2fs_first_dblock, + fs->e2fs.e2fs_bpg); + fs->e2fs_fsbtodb = fs->e2fs.e2fs_log_bsize + LOG_MINBSIZE - DEV_BSHIFT; + fs->e2fs_bsize = MINBSIZE << fs->e2fs.e2fs_log_bsize; + fs->e2fs_bshift = LOG_MINBSIZE + fs->e2fs.e2fs_log_bsize; + fs->e2fs_qbmask = fs->e2fs_bsize - 1; + fs->e2fs_bmask = ~fs->e2fs_qbmask; + fs->e2fs_ngdb = + howmany(fs->e2fs_ncg, fs->e2fs_bsize / sizeof(struct ext2_gd)); + fs->e2fs_ipb = fs->e2fs_bsize / EXT2_DINODE_SIZE(fs); + fs->e2fs_itpg = fs->e2fs.e2fs_ipg / fs->e2fs_ipb; + brelse(bp, 0); + + /* + * Step 3: re-read summary information from disk. + */ + + for (i = 0; i < fs->e2fs_ngdb; i++) { + error = bread(devvp , + fsbtodb(fs, fs->e2fs.e2fs_first_dblock + + 1 /* superblock */ + i), + fs->e2fs_bsize, NOCRED, 0, &bp); + if (error) { + brelse(bp, 0); + return (error); + } + e2fs_cgload((struct ext2_gd *)bp->b_data, + &fs->e2fs_gd[i * fs->e2fs_bsize / sizeof(struct ext2_gd)], + fs->e2fs_bsize); + brelse(bp, 0); + } + + /* Allocate a marker vnode. */ + mvp = vnalloc(mp); + /* + * NOTE: not using the TAILQ_FOREACH here since in this loop vgone() + * and vclean() can be called indirectly + */ + mutex_enter(&mntvnode_lock); +loop: + for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) { + vmark(mvp, vp); + if (vp->v_mount != mp || vismarker(vp)) + continue; + /* + * Step 4: invalidate all inactive vnodes. + */ + if (vrecycle(vp, &mntvnode_lock, l)) { + mutex_enter(&mntvnode_lock); + (void)vunmark(mvp); + goto loop; + } + /* + * Step 5: invalidate all cached file data. + */ + mutex_enter(vp->v_interlock); + mutex_exit(&mntvnode_lock); + if (vget(vp, LK_EXCLUSIVE)) { + mutex_enter(&mntvnode_lock); + (void)vunmark(mvp); + goto loop; + } + if (vinvalbuf(vp, 0, cred, l, 0, 0)) + panic("ext2fs_reload: dirty2"); + /* + * Step 6: re-read inode data for all active vnodes. + */ + ip = VTOI(vp); + error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), + (int)fs->e2fs_bsize, NOCRED, 0, &bp); + if (error) { + vput(vp); + mutex_enter(&mntvnode_lock); + (void)vunmark(mvp); + break; + } + cp = (char *)bp->b_data + + (ino_to_fsbo(fs, ip->i_number) * EXT2_DINODE_SIZE(fs)); + e2fs_iload((struct ext2fs_dinode *)cp, ip->i_din.e2fs_din); + ext2fs_set_inode_guid(ip); + brelse(bp, 0); + vput(vp); + mutex_enter(&mntvnode_lock); + } + mutex_exit(&mntvnode_lock); + vnfree(mvp); + return (error); +} + +/* + * Common code for mount and mountroot + */ +int +ext2fs_mountfs(struct vnode *devvp, struct mount *mp) +{ + struct lwp *l = curlwp; + struct ufsmount *ump; + struct buf *bp; + struct ext2fs *fs; + struct m_ext2fs *m_fs; + dev_t dev; + int error, i, ronly; + kauth_cred_t cred; + struct proc *p; + + dev = devvp->v_rdev; + p = l ? l->l_proc : NULL; + cred = l ? l->l_cred : NOCRED; + + /* Flush out any old buffers remaining from a previous use. 
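+ * V_SAVE asks vinvalbuf() to write any dirty buffers to disk before they
+ * are invalidated.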
*/ + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = vinvalbuf(devvp, V_SAVE, cred, l, 0, 0); + VOP_UNLOCK(devvp); + if (error) + return (error); + + ronly = (mp->mnt_flag & MNT_RDONLY) != 0; + + bp = NULL; + ump = NULL; + +#ifdef DEBUG_EXT2 + printf("ext2 sb size: %zu\n", sizeof(struct ext2fs)); +#endif + error = bread(devvp, SBLOCK, SBSIZE, cred, 0, &bp); + if (error) + goto out; + fs = (struct ext2fs *)bp->b_data; + error = ext2fs_checksb(fs, ronly); + if (error) + goto out; + ump = malloc(sizeof(*ump), M_UFSMNT, M_WAITOK); + memset(ump, 0, sizeof(*ump)); + ump->um_fstype = UFS1; + ump->um_ops = &ext2fs_ufsops; + ump->um_e2fs = malloc(sizeof(struct m_ext2fs), M_UFSMNT, M_WAITOK); + memset(ump->um_e2fs, 0, sizeof(struct m_ext2fs)); + e2fs_sbload((struct ext2fs *)bp->b_data, &ump->um_e2fs->e2fs); + brelse(bp, 0); + bp = NULL; + m_fs = ump->um_e2fs; + m_fs->e2fs_ronly = ronly; + +#ifdef DEBUG_EXT2 + printf("ext2 ino size %zu\n", EXT2_DINODE_SIZE(m_fs)); +#endif + if (ronly == 0) { + if (m_fs->e2fs.e2fs_state == E2FS_ISCLEAN) + m_fs->e2fs.e2fs_state = 0; + else + m_fs->e2fs.e2fs_state = E2FS_ERRORS; + m_fs->e2fs_fmod = 1; + } + + /* compute dynamic sb infos */ + m_fs->e2fs_ncg = + howmany(m_fs->e2fs.e2fs_bcount - m_fs->e2fs.e2fs_first_dblock, + m_fs->e2fs.e2fs_bpg); + m_fs->e2fs_fsbtodb = m_fs->e2fs.e2fs_log_bsize + LOG_MINBSIZE - DEV_BSHIFT; + m_fs->e2fs_bsize = MINBSIZE << m_fs->e2fs.e2fs_log_bsize; + m_fs->e2fs_bshift = LOG_MINBSIZE + m_fs->e2fs.e2fs_log_bsize; + m_fs->e2fs_qbmask = m_fs->e2fs_bsize - 1; + m_fs->e2fs_bmask = ~m_fs->e2fs_qbmask; + m_fs->e2fs_ngdb = + howmany(m_fs->e2fs_ncg, m_fs->e2fs_bsize / sizeof(struct ext2_gd)); + m_fs->e2fs_ipb = m_fs->e2fs_bsize / EXT2_DINODE_SIZE(m_fs); + m_fs->e2fs_itpg = m_fs->e2fs.e2fs_ipg / m_fs->e2fs_ipb; + + m_fs->e2fs_gd = malloc(m_fs->e2fs_ngdb * m_fs->e2fs_bsize, + M_UFSMNT, M_WAITOK); + for (i = 0; i < m_fs->e2fs_ngdb; i++) { + error = bread(devvp , + fsbtodb(m_fs, m_fs->e2fs.e2fs_first_dblock + + 1 /* superblock */ + i), + m_fs->e2fs_bsize, NOCRED, 0, &bp); + if (error) { + free(m_fs->e2fs_gd, M_UFSMNT); + goto out; + } + e2fs_cgload((struct ext2_gd *)bp->b_data, + &m_fs->e2fs_gd[ + i * m_fs->e2fs_bsize / sizeof(struct ext2_gd)], + m_fs->e2fs_bsize); + brelse(bp, 0); + bp = NULL; + } + + mp->mnt_data = ump; + mp->mnt_stat.f_fsidx.__fsid_val[0] = (long)dev; + mp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_EXT2FS); + mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; + mp->mnt_stat.f_namemax = EXT2FS_MAXNAMLEN; + mp->mnt_flag |= MNT_LOCAL; + mp->mnt_dev_bshift = DEV_BSHIFT; /* XXX */ + mp->mnt_fs_bshift = m_fs->e2fs_bshift; + mp->mnt_iflag |= IMNT_DTYPE; + ump->um_flags = 0; + ump->um_mountp = mp; + ump->um_dev = dev; + ump->um_devvp = devvp; + ump->um_nindir = NINDIR(m_fs); + ump->um_lognindir = ffs(NINDIR(m_fs)) - 1; + ump->um_bptrtodb = m_fs->e2fs_fsbtodb; + ump->um_seqinc = 1; /* no frags */ + ump->um_maxsymlinklen = EXT2_MAXSYMLINKLEN; + ump->um_dirblksiz = m_fs->e2fs_bsize; + ump->um_maxfilesize = ((uint64_t)0x80000000 * m_fs->e2fs_bsize - 1); + devvp->v_specmountpoint = mp; + return (0); + +out: + KASSERT(bp != NULL); + brelse(bp, 0); + if (ump) { + free(ump->um_e2fs, M_UFSMNT); + free(ump, M_UFSMNT); + mp->mnt_data = NULL; + } + return (error); +} + +/* + * unmount system call + */ +int +ext2fs_unmount(struct mount *mp, int mntflags) +{ + struct ufsmount *ump; + struct m_ext2fs *fs; + int error, flags; + + flags = 0; + if (mntflags & MNT_FORCE) + flags |= FORCECLOSE; + if ((error = ext2fs_flushfiles(mp, flags)) != 
0) + return (error); + ump = VFSTOUFS(mp); + fs = ump->um_e2fs; + if (fs->e2fs_ronly == 0 && + ext2fs_cgupdate(ump, MNT_WAIT) == 0 && + (fs->e2fs.e2fs_state & E2FS_ERRORS) == 0) { + fs->e2fs.e2fs_state = E2FS_ISCLEAN; + (void) ext2fs_sbupdate(ump, MNT_WAIT); + } + if (ump->um_devvp->v_type != VBAD) + ump->um_devvp->v_specmountpoint = NULL; + vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_CLOSE(ump->um_devvp, fs->e2fs_ronly ? FREAD : FREAD|FWRITE, + NOCRED); + vput(ump->um_devvp); + free(fs->e2fs_gd, M_UFSMNT); + free(fs, M_UFSMNT); + free(ump, M_UFSMNT); + mp->mnt_data = NULL; + mp->mnt_flag &= ~MNT_LOCAL; + return (error); +} + +/* + * Flush out all the files in a filesystem. + */ +int +ext2fs_flushfiles(struct mount *mp, int flags) +{ + extern int doforce; + int error; + + if (!doforce) + flags &= ~FORCECLOSE; + error = vflush(mp, NULLVP, flags); + return (error); +} + +/* + * Get file system statistics. + */ +int +ext2fs_statvfs(struct mount *mp, struct statvfs *sbp) +{ + struct ufsmount *ump; + struct m_ext2fs *fs; + uint32_t overhead, overhead_per_group, ngdb; + int i, ngroups; + + ump = VFSTOUFS(mp); + fs = ump->um_e2fs; + if (fs->e2fs.e2fs_magic != E2FS_MAGIC) + panic("ext2fs_statvfs"); + + /* + * Compute the overhead (FS structures) + */ + overhead_per_group = + 1 /* block bitmap */ + + 1 /* inode bitmap */ + + fs->e2fs_itpg; + overhead = fs->e2fs.e2fs_first_dblock + + fs->e2fs_ncg * overhead_per_group; + if (fs->e2fs.e2fs_rev > E2FS_REV0 && + fs->e2fs.e2fs_features_rocompat & EXT2F_ROCOMPAT_SPARSESUPER) { + for (i = 0, ngroups = 0; i < fs->e2fs_ncg; i++) { + if (cg_has_sb(i)) + ngroups++; + } + } else { + ngroups = fs->e2fs_ncg; + } + ngdb = fs->e2fs_ngdb; + if (fs->e2fs.e2fs_rev > E2FS_REV0 && + fs->e2fs.e2fs_features_compat & EXT2F_COMPAT_RESIZE) + ngdb += fs->e2fs.e2fs_reserved_ngdb; + overhead += ngroups * (1 /* superblock */ + ngdb); + + sbp->f_bsize = fs->e2fs_bsize; + sbp->f_frsize = MINBSIZE << fs->e2fs.e2fs_fsize; + sbp->f_iosize = fs->e2fs_bsize; + sbp->f_blocks = fs->e2fs.e2fs_bcount - overhead; + sbp->f_bfree = fs->e2fs.e2fs_fbcount; + sbp->f_bresvd = fs->e2fs.e2fs_rbcount; + if (sbp->f_bfree > sbp->f_bresvd) + sbp->f_bavail = sbp->f_bfree - sbp->f_bresvd; + else + sbp->f_bavail = 0; + sbp->f_files = fs->e2fs.e2fs_icount; + sbp->f_ffree = fs->e2fs.e2fs_ficount; + sbp->f_favail = fs->e2fs.e2fs_ficount; + sbp->f_fresvd = 0; + copy_statvfs_info(sbp, mp); + return (0); +} + +/* + * Go through the disk queues to initiate sandbagged IO; + * go through the inodes to write those that have been modified; + * initiate the writing of the super block if it has been modified. + * + * Note: we are always called with the filesystem marked `MPBUSY'. + */ +int +ext2fs_sync(struct mount *mp, int waitfor, kauth_cred_t cred) +{ + struct vnode *vp, *mvp; + struct inode *ip; + struct ufsmount *ump = VFSTOUFS(mp); + struct m_ext2fs *fs; + int error, allerror = 0; + + fs = ump->um_e2fs; + if (fs->e2fs_fmod != 0 && fs->e2fs_ronly != 0) { /* XXX */ + printf("fs = %s\n", fs->e2fs_fsmnt); + panic("update: rofs mod"); + } + + /* Allocate a marker vnode. */ + mvp = vnalloc(mp); + + /* + * Write back each (modified) inode. 
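+ * A marker vnode (mvp) holds our place in mnt_vnodelist across the points
+ * where this loop can sleep, since the list may change underneath us.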
+ */ + mutex_enter(&mntvnode_lock); +loop: + /* + * NOTE: not using the TAILQ_FOREACH here since in this loop vgone() + * and vclean() can be called indirectly + */ + for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) { + vmark(mvp, vp); + if (vp->v_mount != mp || vismarker(vp)) + continue; + mutex_enter(vp->v_interlock); + ip = VTOI(vp); + if (ip == NULL || (vp->v_iflag & (VI_XLOCK|VI_CLEAN)) != 0 || + vp->v_type == VNON || + ((ip->i_flag & + (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) == 0 && + LIST_EMPTY(&vp->v_dirtyblkhd) && + UVM_OBJ_IS_CLEAN(&vp->v_uobj))) + { + mutex_exit(vp->v_interlock); + continue; + } + mutex_exit(&mntvnode_lock); + error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT); + if (error) { + mutex_enter(&mntvnode_lock); + if (error == ENOENT) { + mutex_enter(&mntvnode_lock); + (void)vunmark(mvp); + goto loop; + } + continue; + } + if (vp->v_type == VREG && waitfor == MNT_LAZY) + error = ext2fs_update(vp, NULL, NULL, 0); + else + error = VOP_FSYNC(vp, cred, + waitfor == MNT_WAIT ? FSYNC_WAIT : 0, 0, 0); + if (error) + allerror = error; + vput(vp); + mutex_enter(&mntvnode_lock); + } + mutex_exit(&mntvnode_lock); + vnfree(mvp); + /* + * Force stale file system control information to be flushed. + */ + if (waitfor != MNT_LAZY) { + vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); + if ((error = VOP_FSYNC(ump->um_devvp, cred, + waitfor == MNT_WAIT ? FSYNC_WAIT : 0, 0, 0)) != 0) + allerror = error; + VOP_UNLOCK(ump->um_devvp); + } + /* + * Write back modified superblock. + */ + if (fs->e2fs_fmod != 0) { + fs->e2fs_fmod = 0; + fs->e2fs.e2fs_wtime = time_second; + if ((error = ext2fs_cgupdate(ump, waitfor))) + allerror = error; + } + return (allerror); +} + +/* + * Look up a EXT2FS dinode number to find its incore vnode, otherwise read it + * in from disk. If it is in core, wait for the lock bit to clear, then + * return the inode locked. Detection and handling of mount points must be + * done by the calling routine. + */ +int +ext2fs_vget(struct mount *mp, ino_t ino, struct vnode **vpp) +{ + struct m_ext2fs *fs; + struct inode *ip; + struct ufsmount *ump; + struct buf *bp; + struct vnode *vp; + dev_t dev; + int error; + void *cp; + + ump = VFSTOUFS(mp); + dev = ump->um_dev; +retry: + if ((*vpp = ufs_ihashget(dev, ino, LK_EXCLUSIVE)) != NULL) + return (0); + + /* Allocate a new vnode/inode. */ + error = getnewvnode(VT_EXT2FS, mp, ext2fs_vnodeop_p, NULL, &vp); + if (error) { + *vpp = NULL; + return (error); + } + ip = pool_get(&ext2fs_inode_pool, PR_WAITOK); + + mutex_enter(&ufs_hashlock); + if ((*vpp = ufs_ihashget(dev, ino, 0)) != NULL) { + mutex_exit(&ufs_hashlock); + ungetnewvnode(vp); + pool_put(&ext2fs_inode_pool, ip); + goto retry; + } + + vp->v_vflag |= VV_LOCKSWORK; + + memset(ip, 0, sizeof(struct inode)); + vp->v_data = ip; + ip->i_vnode = vp; + ip->i_ump = ump; + ip->i_e2fs = fs = ump->um_e2fs; + ip->i_dev = dev; + ip->i_number = ino; + ip->i_e2fs_last_lblk = 0; + ip->i_e2fs_last_blk = 0; + genfs_node_init(vp, &ext2fs_genfsops); + + /* + * Put it onto its hash chain and lock it so that other requests for + * this inode will block if they arrive while we are sleeping waiting + * for old data structures to be purged or for the contents of the + * disk portion of this inode to be read. + */ + + ufs_ihashins(ip); + mutex_exit(&ufs_hashlock); + + /* Read in the disk contents for the inode, copy into the inode. 
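+ * ino_to_fsba() locates the filesystem block holding the inode and
+ * ino_to_fsbo() its slot within that block; e2fs_iload() then copies the
+ * on-disk dinode into the in-core structure, converting byte order where
+ * needed.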
*/ + error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), + (int)fs->e2fs_bsize, NOCRED, 0, &bp); + if (error) { + + /* + * The inode does not contain anything useful, so it would + * be misleading to leave it on its hash chain. With mode + * still zero, it will be unlinked and returned to the free + * list by vput(). + */ + + vput(vp); + brelse(bp, 0); + *vpp = NULL; + return (error); + } + cp = (char *)bp->b_data + (ino_to_fsbo(fs, ino) * EXT2_DINODE_SIZE(fs)); + ip->i_din.e2fs_din = pool_get(&ext2fs_dinode_pool, PR_WAITOK); + e2fs_iload((struct ext2fs_dinode *)cp, ip->i_din.e2fs_din); + ext2fs_set_inode_guid(ip); + brelse(bp, 0); + + /* If the inode was deleted, reset all fields */ + if (ip->i_e2fs_dtime != 0) { + ip->i_e2fs_mode = ip->i_e2fs_nblock = 0; + (void)ext2fs_setsize(ip, 0); + memset(ip->i_e2fs_blocks, 0, sizeof(ip->i_e2fs_blocks)); + } + + /* + * Initialize the vnode from the inode, check for aliases. + */ + + error = ext2fs_vinit(mp, ext2fs_specop_p, ext2fs_fifoop_p, &vp); + if (error) { + vput(vp); + *vpp = NULL; + return (error); + } + /* + * Finish inode initialization now that aliasing has been resolved. + */ + + ip->i_devvp = ump->um_devvp; + vref(ip->i_devvp); + + /* + * Set up a generation number for this inode if it does not + * already have one. This should only happen on old filesystems. + */ + + if (ip->i_e2fs_gen == 0) { + if (++ext2gennumber < (u_long)time_second) + ext2gennumber = time_second; + ip->i_e2fs_gen = ext2gennumber; + if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) + ip->i_flag |= IN_MODIFIED; + } + uvm_vnp_setsize(vp, ext2fs_size(ip)); + *vpp = vp; + return (0); +} + +/* + * File handle to vnode + * + * Have to be really careful about stale file handles: + * - check that the inode number is valid + * - call ext2fs_vget() to get the locked inode + * - check for an unallocated inode (i_mode == 0) + */ +int +ext2fs_fhtovp(struct mount *mp, struct fid *fhp, struct vnode **vpp) +{ + struct inode *ip; + struct vnode *nvp; + int error; + struct ufid ufh; + struct m_ext2fs *fs; + + if (fhp->fid_len != sizeof(struct ufid)) + return EINVAL; + + memcpy(&ufh, fhp, sizeof(struct ufid)); + fs = VFSTOUFS(mp)->um_e2fs; + if ((ufh.ufid_ino < EXT2_FIRSTINO && ufh.ufid_ino != EXT2_ROOTINO) || + ufh.ufid_ino >= fs->e2fs_ncg * fs->e2fs.e2fs_ipg) + return (ESTALE); + + if ((error = VFS_VGET(mp, ufh.ufid_ino, &nvp)) != 0) { + *vpp = NULLVP; + return (error); + } + ip = VTOI(nvp); + if (ip->i_e2fs_mode == 0 || ip->i_e2fs_dtime != 0 || + ip->i_e2fs_gen != ufh.ufid_gen) { + vput(nvp); + *vpp = NULLVP; + return (ESTALE); + } + *vpp = nvp; + return (0); +} + +/* + * Vnode pointer to File handle + */ +/* ARGSUSED */ +int +ext2fs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size) +{ + struct inode *ip; + struct ufid ufh; + + if (*fh_size < sizeof(struct ufid)) { + *fh_size = sizeof(struct ufid); + return E2BIG; + } + *fh_size = sizeof(struct ufid); + + ip = VTOI(vp); + memset(&ufh, 0, sizeof(ufh)); + ufh.ufid_len = sizeof(struct ufid); + ufh.ufid_ino = ip->i_number; + ufh.ufid_gen = ip->i_e2fs_gen; + memcpy(fhp, &ufh, sizeof(ufh)); + return (0); +} + +/* + * Write a superblock and associated information back to disk. 
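+ * ext2fs_sbupdate() rewrites the superblock itself; ext2fs_cgupdate()
+ * below additionally rewrites the group descriptor blocks.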
+ */ +int +ext2fs_sbupdate(struct ufsmount *mp, int waitfor) +{ + struct m_ext2fs *fs = mp->um_e2fs; + struct buf *bp; + int error = 0; + + bp = getblk(mp->um_devvp, SBLOCK, SBSIZE, 0, 0); + e2fs_sbsave(&fs->e2fs, (struct ext2fs*)bp->b_data); + if (waitfor == MNT_WAIT) + error = bwrite(bp); + else + bawrite(bp); + return (error); +} + +int +ext2fs_cgupdate(struct ufsmount *mp, int waitfor) +{ + struct m_ext2fs *fs = mp->um_e2fs; + struct buf *bp; + int i, error = 0, allerror = 0; + + allerror = ext2fs_sbupdate(mp, waitfor); + for (i = 0; i < fs->e2fs_ngdb; i++) { + bp = getblk(mp->um_devvp, fsbtodb(fs, + fs->e2fs.e2fs_first_dblock + + 1 /* superblock */ + i), fs->e2fs_bsize, 0, 0); + e2fs_cgsave(&fs->e2fs_gd[ + i * fs->e2fs_bsize / sizeof(struct ext2_gd)], + (struct ext2_gd *)bp->b_data, fs->e2fs_bsize); + if (waitfor == MNT_WAIT) + error = bwrite(bp); + else + bawrite(bp); + } + + if (!allerror && error) + allerror = error; + return (allerror); +} + +static int +ext2fs_checksb(struct ext2fs *fs, int ronly) +{ + + if (fs2h16(fs->e2fs_magic) != E2FS_MAGIC) { + return (EINVAL); /* XXX needs translation */ + } + if (fs2h32(fs->e2fs_rev) > E2FS_REV1) { +#ifdef DIAGNOSTIC + printf("Ext2 fs: unsupported revision number: %x\n", + fs2h32(fs->e2fs_rev)); +#endif + return (EINVAL); /* XXX needs translation */ + } + if (fs2h32(fs->e2fs_log_bsize) > 2) { /* block size = 1024|2048|4096 */ +#ifdef DIAGNOSTIC + printf("Ext2 fs: bad block size: %d " + "(expected <= 2 for ext2 fs)\n", + fs2h32(fs->e2fs_log_bsize)); +#endif + return (EINVAL); /* XXX needs translation */ + } + if (fs2h32(fs->e2fs_rev) > E2FS_REV0) { + if (fs2h32(fs->e2fs_first_ino) != EXT2_FIRSTINO) { + printf("Ext2 fs: unsupported first inode position\n"); + return (EINVAL); /* XXX needs translation */ + } + if (fs2h32(fs->e2fs_features_incompat) & + ~EXT2F_INCOMPAT_SUPP) { + printf("Ext2 fs: unsupported optional feature\n"); + return (EINVAL); /* XXX needs translation */ + } + if (!ronly && fs2h32(fs->e2fs_features_rocompat) & + ~EXT2F_ROCOMPAT_SUPP) { + return (EROFS); /* XXX needs translation */ + } + } + return (0); +} diff --git a/sys/ufs/ext2fs/ext2fs_vnops.c b/sys/ufs/ext2fs/ext2fs_vnops.c new file mode 100644 index 000000000..0ea5dd35c --- /dev/null +++ b/sys/ufs/ext2fs/ext2fs_vnops.c @@ -0,0 +1,1664 @@ +/* $NetBSD: ext2fs_vnops.c,v 1.101 2011/11/18 21:18:51 christos Exp $ */ + +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_vnops.c 8.14 (Berkeley) 10/26/94 + * Modified for ext2fs by Manuel Bouyer. + */ + +/* + * Copyright (c) 1997 Manuel Bouyer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @(#)ufs_vnops.c 8.14 (Berkeley) 10/26/94 + * Modified for ext2fs by Manuel Bouyer. 
+ */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ext2fs_vnops.c,v 1.101 2011/11/18 21:18:51 christos Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +extern int prtactive; + +static int ext2fs_chmod(struct vnode *, int, kauth_cred_t, struct lwp *); +static int ext2fs_chown(struct vnode *, uid_t, gid_t, kauth_cred_t, + struct lwp *); + +union _qcvt { + int64_t qcvt; + int32_t val[2]; +}; + +#define SETHIGH(q, h) { \ + union _qcvt tmp; \ + tmp.qcvt = (q); \ + tmp.val[_QUAD_HIGHWORD] = (h); \ + (q) = tmp.qcvt; \ +} +#define SETLOW(q, l) { \ + union _qcvt tmp; \ + tmp.qcvt = (q); \ + tmp.val[_QUAD_LOWWORD] = (l); \ + (q) = tmp.qcvt; \ +} + +/* + * Create a regular file + */ +int +ext2fs_create(void *v) +{ + struct vop_create_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap = v; + int error; + + error = + ext2fs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), + ap->a_dvp, ap->a_vpp, ap->a_cnp); + + if (error) + return (error); + VN_KNOTE(ap->a_dvp, NOTE_WRITE); + return (0); +} + +/* + * Mknod vnode call + */ +/* ARGSUSED */ +int +ext2fs_mknod(void *v) +{ + struct vop_mknod_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap = v; + struct vattr *vap = ap->a_vap; + struct vnode **vpp = ap->a_vpp; + struct inode *ip; + int error; + struct mount *mp; + ino_t ino; + + if ((error = ext2fs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), + ap->a_dvp, vpp, ap->a_cnp)) != 0) + return (error); + VN_KNOTE(ap->a_dvp, NOTE_WRITE); + ip = VTOI(*vpp); + mp = (*vpp)->v_mount; + ino = ip->i_number; + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + if (vap->va_rdev != VNOVAL) { + /* + * Want to be able to use this to make badblock + * inodes, so don't truncate the dev number. + */ + ip->i_din.e2fs_din->e2di_rdev = h2fs32(vap->va_rdev); + } + /* + * Remove inode so that it will be reloaded by VFS_VGET and + * checked to see if it is an alias of an existing entry in + * the inode cache. + */ + VOP_UNLOCK(*vpp); + (*vpp)->v_type = VNON; + vgone(*vpp); + error = VFS_VGET(mp, ino, vpp); + if (error != 0) { + *vpp = NULL; + return (error); + } + return (0); +} + +/* + * Open called. + * + * Just check the APPEND flag. + */ +/* ARGSUSED */ +int +ext2fs_open(void *v) +{ + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + kauth_cred_t a_cred; + } */ *ap = v; + + /* + * Files marked append-only must be opened for appending. + */ + if ((VTOI(ap->a_vp)->i_e2fs_flags & EXT2_APPEND) && + (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) + return (EPERM); + return (0); +} + +static int +ext2fs_check_possible(struct vnode *vp, struct inode *ip, mode_t mode) +{ + + /* + * Disallow write attempts on read-only file systems; + * unless the file is a socket, fifo, or a block or + * character device resident on the file system. + */ + if (mode & VWRITE) { + switch (vp->v_type) { + case VDIR: + case VLNK: + case VREG: + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + break; + default: + break; + } + } + + /* If immutable bit set, nobody gets to write it. 
*/ + if ((mode & VWRITE) && (ip->i_e2fs_flags & EXT2_IMMUTABLE)) + return (EPERM); + + return 0; +} + +static int +ext2fs_check_permitted(struct vnode *vp, struct inode *ip, mode_t mode, + kauth_cred_t cred) +{ + + return genfs_can_access(vp->v_type, ip->i_e2fs_mode & ALLPERMS, + ip->i_uid, ip->i_gid, mode, cred); +} + +int +ext2fs_access(void *v) +{ + struct vop_access_args /* { + struct vnode *a_vp; + int a_mode; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + mode_t mode = ap->a_mode; + int error; + + error = ext2fs_check_possible(vp, ip, mode); + if (error) + return error; + + error = ext2fs_check_permitted(vp, ip, mode, ap->a_cred); + + return error; +} + +/* ARGSUSED */ +int +ext2fs_getattr(void *v) +{ + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct vattr *vap = ap->a_vap; + + EXT2FS_ITIMES(ip, NULL, NULL, NULL); + /* + * Copy from inode table + */ + vap->va_fsid = ip->i_dev; + vap->va_fileid = ip->i_number; + vap->va_mode = ip->i_e2fs_mode & ALLPERMS; + vap->va_nlink = ip->i_e2fs_nlink; + vap->va_uid = ip->i_uid; + vap->va_gid = ip->i_gid; + vap->va_rdev = (dev_t)fs2h32(ip->i_din.e2fs_din->e2di_rdev); + vap->va_size = vp->v_size; + vap->va_atime.tv_sec = ip->i_e2fs_atime; + vap->va_atime.tv_nsec = 0; + vap->va_mtime.tv_sec = ip->i_e2fs_mtime; + vap->va_mtime.tv_nsec = 0; + vap->va_ctime.tv_sec = ip->i_e2fs_ctime; + vap->va_ctime.tv_nsec = 0; +#ifdef EXT2FS_SYSTEM_FLAGS + vap->va_flags = (ip->i_e2fs_flags & EXT2_APPEND) ? SF_APPEND : 0; + vap->va_flags |= (ip->i_e2fs_flags & EXT2_IMMUTABLE) ? SF_IMMUTABLE : 0; +#else + vap->va_flags = (ip->i_e2fs_flags & EXT2_APPEND) ? UF_APPEND : 0; + vap->va_flags |= (ip->i_e2fs_flags & EXT2_IMMUTABLE) ? UF_IMMUTABLE : 0; +#endif + vap->va_gen = ip->i_e2fs_gen; + /* this doesn't belong here */ + if (vp->v_type == VBLK) + vap->va_blocksize = BLKDEV_IOSIZE; + else if (vp->v_type == VCHR) + vap->va_blocksize = MAXBSIZE; + else + vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; + vap->va_bytes = dbtob((u_quad_t)ip->i_e2fs_nblock); + vap->va_type = vp->v_type; + vap->va_filerev = ip->i_modrev; + return (0); +} + +/* + * Set attribute vnode op. called from several syscalls + */ +int +ext2fs_setattr(void *v) +{ + struct vop_setattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + kauth_cred_t a_cred; + } */ *ap = v; + struct vattr *vap = ap->a_vap; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + kauth_cred_t cred = ap->a_cred; + struct lwp *l = curlwp; + int error; + + /* + * Check for unsettable attributes. 
+ */ + if ((vap->va_type != VNON) || (vap->va_nlink != (nlink_t)VNOVAL) || + (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || + (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || + ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { + return (EINVAL); + } + if (vap->va_flags != VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + if (kauth_cred_geteuid(cred) != ip->i_uid && + (error = kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, + NULL))) + return (error); +#ifdef EXT2FS_SYSTEM_FLAGS + if (kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, + NULL) == 0) { + if ((ip->i_e2fs_flags & + (EXT2_APPEND | EXT2_IMMUTABLE)) && + kauth_authorize_system(l->l_cred, + KAUTH_SYSTEM_CHSYSFLAGS, 0, NULL, NULL, NULL)) + return (EPERM); + ip->i_e2fs_flags &= ~(EXT2_APPEND | EXT2_IMMUTABLE); + ip->i_e2fs_flags |= + (vap->va_flags & SF_APPEND) ? EXT2_APPEND : 0 | + (vap->va_flags & SF_IMMUTABLE) ? EXT2_IMMUTABLE : 0; + } else + return (EPERM); +#else + ip->i_e2fs_flags &= ~(EXT2_APPEND | EXT2_IMMUTABLE); + ip->i_e2fs_flags |= + (vap->va_flags & UF_APPEND) ? EXT2_APPEND : 0 | + (vap->va_flags & UF_IMMUTABLE) ? EXT2_IMMUTABLE : 0; +#endif + ip->i_flag |= IN_CHANGE; + if (vap->va_flags & (IMMUTABLE | APPEND)) + return (0); + } + if (ip->i_e2fs_flags & (EXT2_APPEND | EXT2_IMMUTABLE)) + return (EPERM); + /* + * Go through the fields and update iff not VNOVAL. + */ + if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + error = ext2fs_chown(vp, vap->va_uid, vap->va_gid, cred, l); + if (error) + return (error); + } + if (vap->va_size != VNOVAL) { + /* + * Disallow write attempts on read-only file systems; + * unless the file is a socket, fifo, or a block or + * character device resident on the file system. + */ + switch (vp->v_type) { + case VDIR: + return (EISDIR); + case VLNK: + case VREG: + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + default: + break; + } + error = ext2fs_truncate(vp, vap->va_size, 0, cred); + if (error) + return (error); + } + ip = VTOI(vp); + if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + error = genfs_can_chtimes(vp, vap->va_vaflags, ip->i_uid, cred); + if (error) + return (error); + if (vap->va_atime.tv_sec != VNOVAL) + if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) + ip->i_flag |= IN_ACCESS; + if (vap->va_mtime.tv_sec != VNOVAL) { + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (vp->v_mount->mnt_flag & MNT_RELATIME) + ip->i_flag |= IN_ACCESS; + } + error = ext2fs_update(vp, &vap->va_atime, &vap->va_mtime, + UPDATE_WAIT); + if (error) + return (error); + } + error = 0; + if (vap->va_mode != (mode_t)VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + error = ext2fs_chmod(vp, (int)vap->va_mode, cred, l); + } + VN_KNOTE(vp, NOTE_ATTRIB); + return (error); +} + +/* + * Change the mode on a file. + * Inode must be locked before calling. + */ +static int +ext2fs_chmod(struct vnode *vp, int mode, kauth_cred_t cred, struct lwp *l) +{ + struct inode *ip = VTOI(vp); + int error; + + error = genfs_can_chmod(vp, cred, ip->i_uid, ip->i_gid, mode); + if (error) + return (error); + + ip->i_e2fs_mode &= ~ALLPERMS; + ip->i_e2fs_mode |= (mode & ALLPERMS); + ip->i_flag |= IN_CHANGE; + return (0); +} + +/* + * Perform chown operation on inode ip; + * inode must be locked prior to call. 
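+ * As in ext2fs_set_inode_guid(), the uid/gid are stored as 16-bit low
+ * halves plus, on revision 1 filesystems, 16-bit high halves.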
+ */ +static int +ext2fs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred, + struct lwp *l) +{ + struct inode *ip = VTOI(vp); + uid_t ouid; + gid_t ogid; + int error; + + if (uid == (uid_t)VNOVAL) + uid = ip->i_uid; + if (gid == (gid_t)VNOVAL) + gid = ip->i_gid; + + error = genfs_can_chown(vp, cred, ip->i_uid, ip->i_gid, uid, gid); + if (error) + return (error); + + ogid = ip->i_gid; + ouid = ip->i_uid; + + ip->i_e2fs_gid = gid & 0xffff; + ip->i_e2fs_uid = uid & 0xffff; + if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0) { + ip->i_e2fs_gid_high = (gid >> 16) & 0xffff; + ip->i_e2fs_uid_high = (uid >> 16) & 0xffff; + } else { + ip->i_e2fs_gid_high = 0; + ip->i_e2fs_uid_high = 0; + } + if (ouid != uid || ogid != gid) { + ext2fs_set_inode_guid(ip); + ip->i_flag |= IN_CHANGE; + } + if (ouid != uid && kauth_authorize_generic(cred, + KAUTH_GENERIC_ISSUSER, NULL) != 0) + ip->i_e2fs_mode &= ~ISUID; + if (ogid != gid && kauth_authorize_generic(cred, + KAUTH_GENERIC_ISSUSER, NULL) != 0) + ip->i_e2fs_mode &= ~ISGID; + return (0); +} + +int +ext2fs_remove(void *v) +{ + struct vop_remove_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap = v; + struct inode *ip; + struct vnode *vp = ap->a_vp; + struct vnode *dvp = ap->a_dvp; + struct ufs_lookup_results *ulr; + int error; + + /* XXX should handle this material another way */ + ulr = &VTOI(dvp)->i_crap; + UFS_CHECK_CRAPCOUNTER(VTOI(dvp)); + + ip = VTOI(vp); + if (vp->v_type == VDIR || + (ip->i_e2fs_flags & (EXT2_IMMUTABLE | EXT2_APPEND)) || + (VTOI(dvp)->i_e2fs_flags & EXT2_APPEND)) { + error = EPERM; + } else { + error = ext2fs_dirremove(dvp, ulr, ap->a_cnp); + if (error == 0) { + ip->i_e2fs_nlink--; + ip->i_flag |= IN_CHANGE; + } + } + + VN_KNOTE(vp, NOTE_DELETE); + VN_KNOTE(dvp, NOTE_WRITE); + if (dvp == vp) + vrele(vp); + else + vput(vp); + vput(dvp); + return (error); +} + +/* + * ext2fs_link: create hard link. + */ +int +ext2fs_link(void *v) +{ + struct vop_link_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap = v; + struct vnode *dvp = ap->a_dvp; + struct vnode *vp = ap->a_vp; + struct componentname *cnp = ap->a_cnp; + struct inode *ip; + int error; + struct ufs_lookup_results *ulr; + + KASSERT(dvp != vp); + KASSERT(vp->v_type != VDIR); + KASSERT(dvp->v_mount == vp->v_mount); + + /* XXX should handle this material another way */ + ulr = &VTOI(dvp)->i_crap; + UFS_CHECK_CRAPCOUNTER(VTOI(dvp)); + + error = vn_lock(vp, LK_EXCLUSIVE); + if (error) { + VOP_ABORTOP(dvp, cnp); + goto out2; + } + ip = VTOI(vp); + if ((nlink_t)ip->i_e2fs_nlink >= LINK_MAX) { + VOP_ABORTOP(dvp, cnp); + error = EMLINK; + goto out1; + } + if (ip->i_e2fs_flags & (EXT2_IMMUTABLE | EXT2_APPEND)) { + VOP_ABORTOP(dvp, cnp); + error = EPERM; + goto out1; + } + ip->i_e2fs_nlink++; + ip->i_flag |= IN_CHANGE; + error = ext2fs_update(vp, NULL, NULL, UPDATE_WAIT); + if (!error) + error = ext2fs_direnter(ip, dvp, ulr, cnp); + if (error) { + ip->i_e2fs_nlink--; + ip->i_flag |= IN_CHANGE; + } +out1: + VOP_UNLOCK(vp); +out2: + VN_KNOTE(vp, NOTE_LINK); + VN_KNOTE(dvp, NOTE_WRITE); + vput(dvp); + return (error); +} + +/* + * Rename system call. + * rename("foo", "bar"); + * is essentially + * unlink("bar"); + * link("foo", "bar"); + * unlink("foo"); + * but ``atomically''. Can't do full commit without saving state in the + * inode on disk which isn't feasible at this time. Best we can do is + * always guarantee the target exists. 
+ * + * Basic algorithm is: + * + * 1) Bump link count on source while we're linking it to the + * target. This also ensure the inode won't be deleted out + * from underneath us while we work (it may be truncated by + * a concurrent `trunc' or `open' for creation). + * 2) Link source to destination. If destination already exists, + * delete it first. + * 3) Unlink source reference to inode if still around. If a + * directory was moved and the parent of the destination + * is different from the source, patch the ".." entry in the + * directory. + */ +int +ext2fs_rename(void *v) +{ + struct vop_rename_args /* { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; + } */ *ap = v; + struct vnode *tvp = ap->a_tvp; + struct vnode *tdvp = ap->a_tdvp; + struct vnode *fvp = ap->a_fvp; + struct vnode *fdvp = ap->a_fdvp; + struct componentname *tcnp = ap->a_tcnp; + struct componentname *fcnp = ap->a_fcnp; + struct inode *ip, *xp, *dp; + struct ext2fs_dirtemplate dirbuf; + int doingdirectory = 0, oldparent = 0, newparent = 0; + int error = 0; + u_char namlen; + + /* + * Check for cross-device rename. + */ + if ((fvp->v_mount != tdvp->v_mount) || + (tvp && (fvp->v_mount != tvp->v_mount))) { + error = EXDEV; +abortit: + VOP_ABORTOP(tdvp, tcnp); /* XXX, why not in NFS? */ + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp) + vput(tvp); + VOP_ABORTOP(fdvp, fcnp); /* XXX, why not in NFS? */ + vrele(fdvp); + vrele(fvp); + return (error); + } + + /* + * Check if just deleting a link name. + */ + if (tvp && ((VTOI(tvp)->i_e2fs_flags & (EXT2_IMMUTABLE | EXT2_APPEND)) || + (VTOI(tdvp)->i_e2fs_flags & EXT2_APPEND))) { + error = EPERM; + goto abortit; + } + if (fvp == tvp) { + if (fvp->v_type == VDIR) { + error = EINVAL; + goto abortit; + } + + /* Release destination completely. */ + VOP_ABORTOP(tdvp, tcnp); + vput(tdvp); + vput(tvp); + + /* Delete source. */ + vrele(fvp); + fcnp->cn_flags &= ~(MODMASK); + fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; + fcnp->cn_nameiop = DELETE; + vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY); + if ((error = relookup(fdvp, &fvp, fcnp, 0))) { + vput(fdvp); + return (error); + } + return (VOP_REMOVE(fdvp, fvp, fcnp)); + } + if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0) + goto abortit; + dp = VTOI(fdvp); + ip = VTOI(fvp); + if ((nlink_t) ip->i_e2fs_nlink >= LINK_MAX) { + VOP_UNLOCK(fvp); + error = EMLINK; + goto abortit; + } + if ((ip->i_e2fs_flags & (EXT2_IMMUTABLE | EXT2_APPEND)) || + (dp->i_e2fs_flags & EXT2_APPEND)) { + VOP_UNLOCK(fvp); + error = EPERM; + goto abortit; + } + if ((ip->i_e2fs_mode & IFMT) == IFDIR) { + error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred); + if (!error && tvp) + error = VOP_ACCESS(tvp, VWRITE, tcnp->cn_cred); + if (error) { + VOP_UNLOCK(fvp); + error = EACCES; + goto abortit; + } + /* + * Avoid ".", "..", and aliases of "." for obvious reasons. + */ + if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || + dp == ip || + (fcnp->cn_flags & ISDOTDOT) || + (tcnp->cn_flags & ISDOTDOT) || + (ip->i_flag & IN_RENAME)) { + VOP_UNLOCK(fvp); + error = EINVAL; + goto abortit; + } + ip->i_flag |= IN_RENAME; + oldparent = dp->i_number; + doingdirectory = 1; + } + VN_KNOTE(fdvp, NOTE_WRITE); /* XXXLUKEM/XXX: right place? */ + + /* + * When the target exists, both the directory + * and target vnodes are returned locked. + */ + dp = VTOI(tdvp); + xp = NULL; + if (tvp) + xp = VTOI(tvp); + + /* + * 1) Bump link count while we're moving stuff + * around. 
If we crash somewhere before + * completing our work, the link count + * may be wrong, but correctable. + */ + ip->i_e2fs_nlink++; + ip->i_flag |= IN_CHANGE; + if ((error = ext2fs_update(fvp, NULL, NULL, UPDATE_WAIT)) != 0) { + VOP_UNLOCK(fvp); + goto bad; + } + + /* + * If ".." must be changed (ie the directory gets a new + * parent) then the source directory must not be in the + * directory hierarchy above the target, as this would + * orphan everything below the source directory. Also + * the user must have write permission in the source so + * as to be able to change "..". We must repeat the call + * to namei, as the parent directory is unlocked by the + * call to checkpath(). + */ + error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred); + VOP_UNLOCK(fvp); + if (oldparent != dp->i_number) + newparent = dp->i_number; + if (doingdirectory && newparent) { + if (error) /* write access check above */ + goto bad; + if (xp != NULL) + vput(tvp); + vref(tdvp); /* compensate for the ref checkpath loses */ + error = ext2fs_checkpath(ip, dp, tcnp->cn_cred); + if (error != 0) { + vrele(tdvp); + goto out; + } + vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY); + if ((error = relookup(tdvp, &tvp, tcnp, 0)) != 0) { + vput(tdvp); + goto out; + } + dp = VTOI(tdvp); + xp = NULL; + if (tvp) + xp = VTOI(tvp); + } + /* + * 2) If target doesn't exist, link the target + * to the source and unlink the source. + * Otherwise, rewrite the target directory + * entry to reference the source inode and + * expunge the original entry's existence. + */ + if (xp == NULL) { + if (dp->i_dev != ip->i_dev) + panic("rename: EXDEV"); + /* + * Account for ".." in new directory. + * When source and destination have the same + * parent we don't fool with the link count. + */ + if (doingdirectory && newparent) { + if ((nlink_t)dp->i_e2fs_nlink >= LINK_MAX) { + error = EMLINK; + goto bad; + } + dp->i_e2fs_nlink++; + dp->i_flag |= IN_CHANGE; + if ((error = ext2fs_update(tdvp, NULL, NULL, + UPDATE_WAIT)) != 0) + goto bad; + } + error = ext2fs_direnter(ip, tdvp, &VTOI(tdvp)->i_crap, tcnp); + if (error != 0) { + if (doingdirectory && newparent) { + dp->i_e2fs_nlink--; + dp->i_flag |= IN_CHANGE; + (void)ext2fs_update(tdvp, NULL, NULL, + UPDATE_WAIT); + } + goto bad; + } + VN_KNOTE(tdvp, NOTE_WRITE); + vput(tdvp); + } else { + if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev) + panic("rename: EXDEV"); + /* + * Short circuit rename(foo, foo). + */ + if (xp->i_number == ip->i_number) + panic("rename: same file"); + /* + * If the parent directory is "sticky", then the user must + * own the parent directory, or the destination of the rename, + * otherwise the destination may not be changed (except by + * root). This implements append-only directories. + */ + if ((dp->i_e2fs_mode & S_ISTXT) && + kauth_authorize_generic(tcnp->cn_cred, + KAUTH_GENERIC_ISSUSER, NULL) != 0 && + kauth_cred_geteuid(tcnp->cn_cred) != dp->i_uid && + xp->i_uid != kauth_cred_geteuid(tcnp->cn_cred)) { + error = EPERM; + goto bad; + } + /* + * Target must be empty if a directory and have no links + * to it. Also, ensure source and target are compatible + * (both directories, or both not directories). 
+ */ + if ((xp->i_e2fs_mode & IFMT) == IFDIR) { + if (!ext2fs_dirempty(xp, dp->i_number, tcnp->cn_cred) || + xp->i_e2fs_nlink > 2) { + error = ENOTEMPTY; + goto bad; + } + if (!doingdirectory) { + error = ENOTDIR; + goto bad; + } + cache_purge(tdvp); + } else if (doingdirectory) { + error = EISDIR; + goto bad; + } + error = ext2fs_dirrewrite(dp, &dp->i_crap, ip, tcnp); + if (error != 0) + goto bad; + /* + * If the target directory is in the same + * directory as the source directory, + * decrement the link count on the parent + * of the target directory. + */ + if (doingdirectory && !newparent) { + dp->i_e2fs_nlink--; + dp->i_flag |= IN_CHANGE; + } + /* + * Adjust the link count of the target to + * reflect the dirrewrite above. If this is + * a directory it is empty and there are + * no links to it, so we can squash the inode and + * any space associated with it. We disallowed + * renaming over top of a directory with links to + * it above, as the remaining link would point to + * a directory without "." or ".." entries. + */ + xp->i_e2fs_nlink--; + if (doingdirectory) { + if (--xp->i_e2fs_nlink != 0) + panic("rename: linked directory"); + error = ext2fs_truncate(tvp, (off_t)0, IO_SYNC, + tcnp->cn_cred); + } + xp->i_flag |= IN_CHANGE; + VN_KNOTE(tdvp, NOTE_WRITE); + vput(tdvp); + VN_KNOTE(tvp, NOTE_DELETE); + vput(tvp); + xp = NULL; + } + + /* + * 3) Unlink the source. + */ + fcnp->cn_flags &= ~(MODMASK); + fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; + vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY); + if ((error = relookup(fdvp, &fvp, fcnp, 0))) { + vput(fdvp); + vrele(ap->a_fvp); + return (error); + } + if (fvp != NULL) { + xp = VTOI(fvp); + dp = VTOI(fdvp); + } else { + /* + * From name has disappeared. + */ + if (doingdirectory) + panic("ext2fs_rename: lost dir entry"); + vrele(ap->a_fvp); + return (0); + } + /* + * Ensure that the directory entry still exists and has not + * changed while the new name has been entered. If the source is + * a file then the entry may have been unlinked or renamed. In + * either case there is no further work to be done. If the source + * is a directory then it cannot have been rmdir'ed; its link + * count of three would cause a rmdir to fail with ENOTEMPTY. + * The IRENAME flag ensures that it cannot be moved by another + * rename. + */ + if (xp != ip) { + if (doingdirectory) + panic("ext2fs_rename: lost dir entry"); + } else { + /* + * If the source is a directory with a + * new parent, the link count of the old + * parent directory must be decremented + * and ".." set to point to the new parent. + */ + if (doingdirectory && newparent) { + KASSERT(dp != NULL); + dp->i_e2fs_nlink--; + dp->i_flag |= IN_CHANGE; + error = vn_rdwr(UIO_READ, fvp, (void *)&dirbuf, + sizeof (struct ext2fs_dirtemplate), (off_t)0, + UIO_SYSSPACE, IO_NODELOCKED, + tcnp->cn_cred, (size_t *)0, NULL); + if (error == 0) { + namlen = dirbuf.dotdot_namlen; + if (namlen != 2 || + dirbuf.dotdot_name[0] != '.' 
|| + dirbuf.dotdot_name[1] != '.') { + ufs_dirbad(xp, (doff_t)12, + "ext2fs_rename: mangled dir"); + } else { + dirbuf.dotdot_ino = h2fs32(newparent); + (void) vn_rdwr(UIO_WRITE, fvp, + (void *)&dirbuf, + sizeof (struct dirtemplate), + (off_t)0, UIO_SYSSPACE, + IO_NODELOCKED|IO_SYNC, + tcnp->cn_cred, (size_t *)0, + NULL); + cache_purge(fdvp); + } + } + } + error = ext2fs_dirremove(fdvp, &VTOI(fdvp)->i_crap, fcnp); + if (!error) { + xp->i_e2fs_nlink--; + xp->i_flag |= IN_CHANGE; + } + xp->i_flag &= ~IN_RENAME; + } + VN_KNOTE(fvp, NOTE_RENAME); + if (dp) + vput(fdvp); + if (xp) + vput(fvp); + vrele(ap->a_fvp); + return (error); + +bad: + if (xp) + vput(ITOV(xp)); + vput(ITOV(dp)); +out: + if (doingdirectory) + ip->i_flag &= ~IN_RENAME; + if (vn_lock(fvp, LK_EXCLUSIVE) == 0) { + ip->i_e2fs_nlink--; + ip->i_flag |= IN_CHANGE; + vput(fvp); + } else + vrele(fvp); + vrele(fdvp); + return (error); +} + +/* + * Mkdir system call + */ +int +ext2fs_mkdir(void *v) +{ + struct vop_mkdir_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap = v; + struct vnode *dvp = ap->a_dvp; + struct vattr *vap = ap->a_vap; + struct componentname *cnp = ap->a_cnp; + struct inode *ip, *dp = VTOI(dvp); + struct vnode *tvp; + struct ext2fs_dirtemplate dirtemplate; + int error, dmode; + struct ufs_lookup_results *ulr; + + /* XXX should handle this material another way */ + ulr = &VTOI(dvp)->i_crap; + UFS_CHECK_CRAPCOUNTER(VTOI(dvp)); + + if ((nlink_t)dp->i_e2fs_nlink >= LINK_MAX) { + error = EMLINK; + goto out; + } + dmode = vap->va_mode & ACCESSPERMS; + dmode |= IFDIR; + /* + * Must simulate part of ext2fs_makeinode here to acquire the inode, + * but not have it entered in the parent directory. The entry is + * made later after writing "." and ".." entries. + */ + if ((error = ext2fs_valloc(dvp, dmode, cnp->cn_cred, &tvp)) != 0) + goto out; + ip = VTOI(tvp); + ip->i_uid = kauth_cred_geteuid(cnp->cn_cred); + ip->i_e2fs_uid = ip->i_uid & 0xffff; + ip->i_e2fs_gid = dp->i_e2fs_gid; + if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0) { + ip->i_e2fs_uid_high = (ip->i_uid >> 16) & 0xffff; + ip->i_e2fs_gid_high = dp->i_e2fs_gid_high; + } else { + ip->i_e2fs_uid_high = 0; + ip->i_e2fs_gid_high = 0; + } + ip->i_gid = ip->i_e2fs_gid | (ip->i_e2fs_gid_high << 16); + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + ip->i_e2fs_mode = dmode; + tvp->v_type = VDIR; /* Rest init'd in getnewvnode(). */ + ip->i_e2fs_nlink = 2; + + /* + * Bump link count in parent directory + * to reflect work done below. Should + * be done before reference is created + * so reparation is possible if we crash. + */ + dp->i_e2fs_nlink++; + dp->i_flag |= IN_CHANGE; + if ((error = ext2fs_update(dvp, NULL, NULL, UPDATE_DIROP)) != 0) + goto bad; + + /* Initialize directory with "." and ".." from static template. 
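+ * The "." entry is given a fixed 12-byte record length and ".." takes the
+ * rest of the first block, as encoded in the reclen fields.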
*/ + memset(&dirtemplate, 0, sizeof(dirtemplate)); + dirtemplate.dot_ino = h2fs32(ip->i_number); + dirtemplate.dot_reclen = h2fs16(12); + dirtemplate.dot_namlen = 1; + if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0 && + (ip->i_e2fs->e2fs.e2fs_features_incompat & EXT2F_INCOMPAT_FTYPE)) { + dirtemplate.dot_type = EXT2_FT_DIR; + } + dirtemplate.dot_name[0] = '.'; + dirtemplate.dotdot_ino = h2fs32(dp->i_number); + dirtemplate.dotdot_reclen = h2fs16(VTOI(dvp)->i_e2fs->e2fs_bsize - 12); + dirtemplate.dotdot_namlen = 2; + if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0 && + (ip->i_e2fs->e2fs.e2fs_features_incompat & EXT2F_INCOMPAT_FTYPE)) { + dirtemplate.dotdot_type = EXT2_FT_DIR; + } + dirtemplate.dotdot_name[0] = dirtemplate.dotdot_name[1] = '.'; + error = vn_rdwr(UIO_WRITE, tvp, (void *)&dirtemplate, + sizeof (dirtemplate), (off_t)0, UIO_SYSSPACE, + IO_NODELOCKED|IO_SYNC, cnp->cn_cred, (size_t *)0, NULL); + if (error) { + dp->i_e2fs_nlink--; + dp->i_flag |= IN_CHANGE; + goto bad; + } + if (VTOI(dvp)->i_e2fs->e2fs_bsize > dvp->v_mount->mnt_stat.f_bsize) + panic("ext2fs_mkdir: blksize"); /* XXX should grow with balloc() */ + else { + error = ext2fs_setsize(ip, VTOI(dvp)->i_e2fs->e2fs_bsize); + if (error) { + dp->i_e2fs_nlink--; + dp->i_flag |= IN_CHANGE; + goto bad; + } + ip->i_flag |= IN_CHANGE; + uvm_vnp_setsize(tvp, ext2fs_size(ip)); + } + + /* Directory set up, now install it's entry in the parent directory. */ + error = ext2fs_direnter(ip, dvp, ulr, cnp); + if (error != 0) { + dp->i_e2fs_nlink--; + dp->i_flag |= IN_CHANGE; + } +bad: + /* + * No need to do an explicit ext2fs_truncate here, vrele will do this + * for us because we set the link count to 0. + */ + if (error) { + ip->i_e2fs_nlink = 0; + ip->i_flag |= IN_CHANGE; + vput(tvp); + } else { + VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK); + *ap->a_vpp = tvp; + } +out: + vput(dvp); + return (error); +} + +/* + * Rmdir system call. + */ +int +ext2fs_rmdir(void *v) +{ + struct vop_rmdir_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct vnode *dvp = ap->a_dvp; + struct componentname *cnp = ap->a_cnp; + struct inode *ip, *dp; + int error; + struct ufs_lookup_results *ulr; + + ip = VTOI(vp); + dp = VTOI(dvp); + + /* XXX should handle this material another way */ + ulr = &dp->i_crap; + UFS_CHECK_CRAPCOUNTER(dp); + + /* + * No rmdir "." please. + */ + if (dp == ip) { + vrele(dvp); + vput(vp); + return (EINVAL); + } + /* + * Verify the directory is empty (and valid). + * (Rmdir ".." won't be valid since + * ".." will contain a reference to + * the current directory and thus be + * non-empty.) + */ + error = 0; + if (ip->i_e2fs_nlink != 2 || + !ext2fs_dirempty(ip, dp->i_number, cnp->cn_cred)) { + error = ENOTEMPTY; + goto out; + } + if ((dp->i_e2fs_flags & EXT2_APPEND) || + (ip->i_e2fs_flags & (EXT2_IMMUTABLE | EXT2_APPEND))) { + error = EPERM; + goto out; + } + /* + * Delete reference to directory before purging + * inode. If we crash in between, the directory + * will be reattached to lost+found, + */ + error = ext2fs_dirremove(dvp, ulr, cnp); + if (error != 0) + goto out; + dp->i_e2fs_nlink--; + dp->i_flag |= IN_CHANGE; + VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK); + cache_purge(dvp); + vput(dvp); + dvp = NULL; + /* + * Truncate inode. The only stuff left + * in the directory is "." and "..". The + * "." reference is inconsequential since + * we're quashing it. The ".." reference + * has already been adjusted above. We've + * removed the "." 
reference and the reference + * in the parent directory, but there may be + * other hard links so decrement by 2 and + * worry about them later. + */ + ip->i_e2fs_nlink -= 2; + error = ext2fs_truncate(vp, (off_t)0, IO_SYNC, cnp->cn_cred); + cache_purge(ITOV(ip)); +out: + VN_KNOTE(vp, NOTE_DELETE); + if (dvp) + vput(dvp); + vput(vp); + return (error); +} + +/* + * symlink -- make a symbolic link + */ +int +ext2fs_symlink(void *v) +{ + struct vop_symlink_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + char *a_target; + } */ *ap = v; + struct vnode *vp, **vpp; + struct inode *ip; + int len, error; + + vpp = ap->a_vpp; + error = ext2fs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp, + vpp, ap->a_cnp); + if (error) + return (error); + VN_KNOTE(ap->a_dvp, NOTE_WRITE); + vp = *vpp; + len = strlen(ap->a_target); + ip = VTOI(vp); + if (len < ip->i_ump->um_maxsymlinklen) { + memcpy((char *)ip->i_din.e2fs_din->e2di_shortlink, ap->a_target, len); + error = ext2fs_setsize(ip, len); + if (error) + goto bad; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (vp->v_mount->mnt_flag & MNT_RELATIME) + ip->i_flag |= IN_ACCESS; + uvm_vnp_setsize(vp, len); + } else + error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0, + UIO_SYSSPACE, IO_NODELOCKED, ap->a_cnp->cn_cred, + (size_t *)0, NULL); +bad: + if (error) + vput(vp); + return (error); +} + +/* + * Return target name of a symbolic link + */ +int +ext2fs_readlink(void *v) +{ + struct vop_readlink_args /* { + struct vnode *a_vp; + struct uio *a_uio; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct ufsmount *ump = ip->i_ump; + int isize; + + isize = ext2fs_size(ip); + if (isize < ump->um_maxsymlinklen || + (ump->um_maxsymlinklen == 0 && ip->i_e2fs_nblock == 0)) { + uiomove((char *)ip->i_din.e2fs_din->e2di_shortlink, isize, ap->a_uio); + return (0); + } + return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred)); +} + +/* + * Advisory record locking support + */ +int +ext2fs_advlock(void *v) +{ + struct vop_advlock_args /* { + struct vnode *a_vp; + void * a_id; + int a_op; + struct flock *a_fl; + int a_flags; + } */ *ap = v; + struct inode *ip = VTOI(ap->a_vp); + + return lf_advlock(ap, &ip->i_lockf, ext2fs_size(ip)); +} + +int +ext2fs_fsync(void *v) +{ + struct vop_fsync_args /* { + struct vnode *a_vp; + kauth_cred_t a_cred; + int a_flags; + off_t offlo; + off_t offhi; + struct proc *a_p; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + int wait; + int error; + + wait = (ap->a_flags & FSYNC_WAIT) != 0; + + if (vp->v_type == VBLK) + error = spec_fsync(v); + else + error = vflushbuf(vp, wait); + if (error == 0 && (ap->a_flags & FSYNC_DATAONLY) == 0) + error = ext2fs_update(vp, NULL, NULL, wait ? UPDATE_WAIT : 0); + + if (error == 0 && ap->a_flags & FSYNC_CACHE) { + int l = 0; + error = VOP_IOCTL(VTOI(vp)->i_devvp, DIOCCACHESYNC, &l, FWRITE, + curlwp->l_cred); + } + + return error; +} + +/* + * Initialize the vnode associated with a new inode, handle aliased + * vnodes. 
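+ * Character and block special files have their vnode switched to the + * spec op vector and their on-disk rdev registered via spec_node_init(); + * FIFOs are switched to the fifo op vector; all other types keep the + * ext2fs vnode ops.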
+ */ +int +ext2fs_vinit(struct mount *mntp, int (**specops)(void *), + int (**fifoops)(void *), struct vnode **vpp) +{ + struct timeval tv; + struct inode *ip; + struct vnode *vp; + + vp = *vpp; + ip = VTOI(vp); + switch(vp->v_type = IFTOVT(ip->i_e2fs_mode)) { + case VCHR: + case VBLK: + vp->v_op = specops; + spec_node_init(vp, fs2h32(ip->i_din.e2fs_din->e2di_rdev)); + break; + case VFIFO: + vp->v_op = fifoops; + break; + case VNON: + case VBAD: + case VSOCK: + case VLNK: + case VDIR: + case VREG: + break; + } + if (ip->i_number == ROOTINO) + vp->v_vflag |= VV_ROOT; + /* + * Initialize modrev times + */ + getmicrouptime(&tv); + SETHIGH(ip->i_modrev, tv.tv_sec); + SETLOW(ip->i_modrev, tv.tv_usec * 4294); + *vpp = vp; + return (0); +} + +/* + * Allocate a new inode. + */ +int +ext2fs_makeinode(int mode, struct vnode *dvp, struct vnode **vpp, + struct componentname *cnp) +{ + struct inode *ip, *pdir; + struct vnode *tvp; + int error, ismember = 0; + struct ufs_lookup_results *ulr; + + pdir = VTOI(dvp); + + /* XXX should handle this material another way */ + ulr = &pdir->i_crap; + UFS_CHECK_CRAPCOUNTER(pdir); + + *vpp = NULL; + if ((mode & IFMT) == 0) + mode |= IFREG; + + if ((error = ext2fs_valloc(dvp, mode, cnp->cn_cred, &tvp)) != 0) { + vput(dvp); + return (error); + } + ip = VTOI(tvp); + ip->i_uid = kauth_cred_geteuid(cnp->cn_cred); + ip->i_e2fs_uid = ip->i_uid & 0xffff; + ip->i_e2fs_gid = pdir->i_e2fs_gid; + if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0) { + ip->i_e2fs_uid_high = (ip->i_uid >> 16) & 0xffff; + ip->i_e2fs_gid_high = pdir->i_e2fs_gid_high; + } else { + ip->i_e2fs_uid_high = 0; + ip->i_e2fs_gid_high = 0; + } + ip->i_gid = ip->i_e2fs_gid | (ip->i_e2fs_gid_high << 16); + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + ip->i_e2fs_mode = mode; + tvp->v_type = IFTOVT(mode); /* Rest init'd in getnewvnode(). */ + ip->i_e2fs_nlink = 1; + if ((ip->i_e2fs_mode & ISGID) && (kauth_cred_ismember_gid(cnp->cn_cred, + ip->i_gid, &ismember) != 0 || !ismember) && + kauth_authorize_generic(cnp->cn_cred, KAUTH_GENERIC_ISSUSER, NULL)) + ip->i_e2fs_mode &= ~ISGID; + + /* + * Make sure inode goes to disk before directory entry. + */ + if ((error = ext2fs_update(tvp, NULL, NULL, UPDATE_WAIT)) != 0) + goto bad; + error = ext2fs_direnter(ip, dvp, ulr, cnp); + if (error != 0) + goto bad; + vput(dvp); + *vpp = tvp; + return (0); + +bad: + /* + * Write error occurred trying to update the inode + * or the directory so must deallocate the inode. + */ + tvp->v_type = VNON; /* Stop explosion if VBLK */ + ip->i_e2fs_nlink = 0; + ip->i_flag |= IN_CHANGE; + vput(tvp); + vput(dvp); + return (error); +} + +/* + * Reclaim an inode so that it can be used for other purposes. + */ +int +ext2fs_reclaim(void *v) +{ + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + int error; + + /* + * The inode must be freed and updated before being removed + * from its hash chain. Other threads trying to gain a hold + * on the inode will be stalled because it is locked (VI_XLOCK). + */ + if (ip->i_omode == 1 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) + ext2fs_vfree(vp, ip->i_number, ip->i_e2fs_mode); + if ((error = ufs_reclaim(vp)) != 0) + return (error); + if (ip->i_din.e2fs_din != NULL) + pool_put(&ext2fs_dinode_pool, ip->i_din.e2fs_din); + genfs_node_destroy(vp); + pool_put(&ext2fs_inode_pool, vp->v_data); + vp->v_data = NULL; + return (0); +} + +/* Global vfs data structures for ext2fs. 
*/ +int (**ext2fs_vnodeop_p)(void *); +const struct vnodeopv_entry_desc ext2fs_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, ext2fs_lookup }, /* lookup */ + { &vop_create_desc, ext2fs_create }, /* create */ + { &vop_mknod_desc, ext2fs_mknod }, /* mknod */ + { &vop_open_desc, ext2fs_open }, /* open */ + { &vop_close_desc, ufs_close }, /* close */ + { &vop_access_desc, ext2fs_access }, /* access */ + { &vop_getattr_desc, ext2fs_getattr }, /* getattr */ + { &vop_setattr_desc, ext2fs_setattr }, /* setattr */ + { &vop_read_desc, ext2fs_read }, /* read */ + { &vop_write_desc, ext2fs_write }, /* write */ + { &vop_ioctl_desc, ufs_ioctl }, /* ioctl */ + { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */ + { &vop_poll_desc, ufs_poll }, /* poll */ + { &vop_kqfilter_desc, genfs_kqfilter }, /* kqfilter */ + { &vop_revoke_desc, ufs_revoke }, /* revoke */ + { &vop_mmap_desc, ufs_mmap }, /* mmap */ + { &vop_fsync_desc, ext2fs_fsync }, /* fsync */ + { &vop_seek_desc, ufs_seek }, /* seek */ + { &vop_remove_desc, ext2fs_remove }, /* remove */ + { &vop_link_desc, ext2fs_link }, /* link */ + { &vop_rename_desc, ext2fs_rename }, /* rename */ + { &vop_mkdir_desc, ext2fs_mkdir }, /* mkdir */ + { &vop_rmdir_desc, ext2fs_rmdir }, /* rmdir */ + { &vop_symlink_desc, ext2fs_symlink }, /* symlink */ + { &vop_readdir_desc, ext2fs_readdir }, /* readdir */ + { &vop_readlink_desc, ext2fs_readlink }, /* readlink */ + { &vop_abortop_desc, ufs_abortop }, /* abortop */ + { &vop_inactive_desc, ext2fs_inactive }, /* inactive */ + { &vop_reclaim_desc, ext2fs_reclaim }, /* reclaim */ + { &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, ext2fs_bmap }, /* bmap */ + { &vop_strategy_desc, ufs_strategy }, /* strategy */ + { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, ufs_pathconf }, /* pathconf */ + { &vop_advlock_desc, ext2fs_advlock }, /* advlock */ + { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ + { &vop_getpages_desc, genfs_getpages }, /* getpages */ + { &vop_putpages_desc, genfs_putpages }, /* putpages */ + { NULL, NULL } +}; +const struct vnodeopv_desc ext2fs_vnodeop_opv_desc = + { &ext2fs_vnodeop_p, ext2fs_vnodeop_entries }; + +int (**ext2fs_specop_p)(void *); +const struct vnodeopv_entry_desc ext2fs_specop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, spec_lookup }, /* lookup */ + { &vop_create_desc, spec_create }, /* create */ + { &vop_mknod_desc, spec_mknod }, /* mknod */ + { &vop_open_desc, spec_open }, /* open */ + { &vop_close_desc, ufsspec_close }, /* close */ + { &vop_access_desc, ext2fs_access }, /* access */ + { &vop_getattr_desc, ext2fs_getattr }, /* getattr */ + { &vop_setattr_desc, ext2fs_setattr }, /* setattr */ + { &vop_read_desc, ufsspec_read }, /* read */ + { &vop_write_desc, ufsspec_write }, /* write */ + { &vop_ioctl_desc, spec_ioctl }, /* ioctl */ + { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */ + { &vop_poll_desc, spec_poll }, /* poll */ + { &vop_kqfilter_desc, spec_kqfilter }, /* kqfilter */ + { &vop_revoke_desc, spec_revoke }, /* revoke */ + { &vop_mmap_desc, spec_mmap }, /* mmap */ + { &vop_fsync_desc, ext2fs_fsync }, /* fsync */ + { &vop_seek_desc, spec_seek }, /* seek */ + { &vop_remove_desc, spec_remove }, /* remove */ + { &vop_link_desc, spec_link }, /* link */ + { &vop_rename_desc, spec_rename }, /* rename */ + { &vop_mkdir_desc, spec_mkdir }, /* mkdir */ + { &vop_rmdir_desc, spec_rmdir }, /* rmdir */ + 
{ &vop_symlink_desc, spec_symlink }, /* symlink */ + { &vop_readdir_desc, spec_readdir }, /* readdir */ + { &vop_readlink_desc, spec_readlink }, /* readlink */ + { &vop_abortop_desc, spec_abortop }, /* abortop */ + { &vop_inactive_desc, ext2fs_inactive }, /* inactive */ + { &vop_reclaim_desc, ext2fs_reclaim }, /* reclaim */ + { &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, spec_bmap }, /* bmap */ + { &vop_strategy_desc, spec_strategy }, /* strategy */ + { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, spec_pathconf }, /* pathconf */ + { &vop_advlock_desc, spec_advlock }, /* advlock */ + { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ + { &vop_getpages_desc, spec_getpages }, /* getpages */ + { &vop_putpages_desc, spec_putpages }, /* putpages */ + { NULL, NULL } +}; +const struct vnodeopv_desc ext2fs_specop_opv_desc = + { &ext2fs_specop_p, ext2fs_specop_entries }; + +int (**ext2fs_fifoop_p)(void *); +const struct vnodeopv_entry_desc ext2fs_fifoop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, vn_fifo_bypass }, /* lookup */ + { &vop_create_desc, vn_fifo_bypass }, /* create */ + { &vop_mknod_desc, vn_fifo_bypass }, /* mknod */ + { &vop_open_desc, vn_fifo_bypass }, /* open */ + { &vop_close_desc, ufsfifo_close }, /* close */ + { &vop_access_desc, ext2fs_access }, /* access */ + { &vop_getattr_desc, ext2fs_getattr }, /* getattr */ + { &vop_setattr_desc, ext2fs_setattr }, /* setattr */ + { &vop_read_desc, ufsfifo_read }, /* read */ + { &vop_write_desc, ufsfifo_write }, /* write */ + { &vop_ioctl_desc, vn_fifo_bypass }, /* ioctl */ + { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */ + { &vop_poll_desc, vn_fifo_bypass }, /* poll */ + { &vop_kqfilter_desc, vn_fifo_bypass }, /* kqfilter */ + { &vop_revoke_desc, vn_fifo_bypass }, /* revoke */ + { &vop_mmap_desc, vn_fifo_bypass }, /* mmap */ + { &vop_fsync_desc, ext2fs_fsync }, /* fsync */ + { &vop_seek_desc, vn_fifo_bypass }, /* seek */ + { &vop_remove_desc, vn_fifo_bypass }, /* remove */ + { &vop_link_desc, vn_fifo_bypass }, /* link */ + { &vop_rename_desc, vn_fifo_bypass }, /* rename */ + { &vop_mkdir_desc, vn_fifo_bypass }, /* mkdir */ + { &vop_rmdir_desc, vn_fifo_bypass }, /* rmdir */ + { &vop_symlink_desc, vn_fifo_bypass }, /* symlink */ + { &vop_readdir_desc, vn_fifo_bypass }, /* readdir */ + { &vop_readlink_desc, vn_fifo_bypass }, /* readlink */ + { &vop_abortop_desc, vn_fifo_bypass }, /* abortop */ + { &vop_inactive_desc, ext2fs_inactive }, /* inactive */ + { &vop_reclaim_desc, ext2fs_reclaim }, /* reclaim */ + { &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, vn_fifo_bypass }, /* bmap */ + { &vop_strategy_desc, vn_fifo_bypass }, /* strategy */ + { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, vn_fifo_bypass }, /* pathconf */ + { &vop_advlock_desc, vn_fifo_bypass }, /* advlock */ + { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ + { &vop_putpages_desc, vn_fifo_bypass }, /* putpages */ + { NULL, NULL } +}; +const struct vnodeopv_desc ext2fs_fifoop_opv_desc = + { &ext2fs_fifoop_p, ext2fs_fifoop_entries }; diff --git a/sys/ufs/ffs/Makefile b/sys/ufs/ffs/Makefile new file mode 100644 index 000000000..1f03afcc4 --- /dev/null +++ b/sys/ufs/ffs/Makefile @@ -0,0 +1,7 @@ +# $NetBSD: Makefile,v 1.1 1998/06/12 23:23:11 cgd Exp $ + +INCSDIR= /usr/include/ufs/ffs + 
+INCS= ffs_extern.h fs.h + +.include diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c new file mode 100644 index 000000000..411f1a83e --- /dev/null +++ b/sys/ufs/ffs/ffs_alloc.c @@ -0,0 +1,2030 @@ +/* $NetBSD: ffs_alloc.c,v 1.130 2011/11/28 08:05:07 tls Exp $ */ + +/*- + * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Marshall + * Kirk McKusick and Network Associates Laboratories, the Security + * Research Division of Network Associates, Inc. under DARPA/SPAWAR + * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS + * research program + * + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_alloc.c 8.19 (Berkeley) 7/13/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.130 2011/11/28 08:05:07 tls Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_ffs.h" +#include "opt_quota.h" +#include "opt_uvm_page_trkown.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifdef UVM_PAGE_TRKOWN +#include +#endif + +static daddr_t ffs_alloccg(struct inode *, int, daddr_t, int, int); +static daddr_t ffs_alloccgblk(struct inode *, struct buf *, daddr_t, int); +static ino_t ffs_dirpref(struct inode *); +static daddr_t ffs_fragextend(struct inode *, int, daddr_t, int, int); +static void ffs_fserr(struct fs *, u_int, const char *); +static daddr_t ffs_hashalloc(struct inode *, int, daddr_t, int, int, + daddr_t (*)(struct inode *, int, daddr_t, int, int)); +static daddr_t ffs_nodealloccg(struct inode *, int, daddr_t, int, int); +static int32_t ffs_mapsearch(struct fs *, struct cg *, + daddr_t, int); +static void ffs_blkfree_common(struct ufsmount *, struct fs *, dev_t, struct buf *, + daddr_t, long, bool); +static void ffs_freefile_common(struct ufsmount *, struct fs *, dev_t, struct buf *, ino_t, + int, bool); + +/* if 1, changes in optimalization strategy are logged */ +int ffs_log_changeopt = 0; + +/* in ffs_tables.c */ +extern const int inside[], around[]; +extern const u_char * const fragtbl[]; + +/* Basic consistency check for block allocations */ +static int +ffs_check_bad_allocation(const char *func, struct fs *fs, daddr_t bno, + long size, dev_t dev, ino_t inum) +{ + if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 || + fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) { + printf("dev = 0x%llx, bno = %" PRId64 " bsize = %d, " + "size = %ld, fs = %s\n", + (long long)dev, bno, fs->fs_bsize, size, fs->fs_fsmnt); + panic("%s: bad size", func); + } + + if (bno >= fs->fs_size) { + printf("bad block %" PRId64 ", ino %llu\n", bno, + (unsigned long long)inum); + ffs_fserr(fs, inum, "bad block"); + return EINVAL; + } + return 0; +} + +/* + * Allocate a block in the file system. + * + * The size of the requested block is given, which must be some + * multiple of fs_fsize and <= fs_bsize. + * A preference may be optionally specified. If a preference is given + * the following hierarchy is used to allocate a block: + * 1) allocate the requested block. + * 2) allocate a rotationally optimal block in the same cylinder. + * 3) allocate a block in the same cylinder group. + * 4) quadradically rehash into other cylinder groups, until an + * available block is located. + * If no block preference is given the following hierarchy is used + * to allocate a block: + * 1) allocate a block in the cylinder group that contains the + * inode for the file. + * 2) quadradically rehash into other cylinder groups, until an + * available block is located. 
+ * + * => called with um_lock held + * => releases um_lock before returning + */ +int +ffs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, int size, int flags, + kauth_cred_t cred, daddr_t *bnp) +{ + struct ufsmount *ump; + struct fs *fs; + daddr_t bno; + int cg; +#if defined(QUOTA) || defined(QUOTA2) + int error; +#endif + + fs = ip->i_fs; + ump = ip->i_ump; + + KASSERT(mutex_owned(&ump->um_lock)); + +#ifdef UVM_PAGE_TRKOWN + + /* + * Sanity-check that allocations within the file size + * do not allow other threads to read the stale contents + * of newly allocated blocks. + * Usually pages will exist to cover the new allocation. + * There is an optimization in ffs_write() where we skip + * creating pages if several conditions are met: + * - the file must not be mapped (in any user address space). + * - the write must cover whole pages and whole blocks. + * If those conditions are not met then pages must exist and + * be locked by the current thread. + */ + + if (ITOV(ip)->v_type == VREG && + lblktosize(fs, (voff_t)lbn) < round_page(ITOV(ip)->v_size)) { + struct vm_page *pg; + struct vnode *vp = ITOV(ip); + struct uvm_object *uobj = &vp->v_uobj; + voff_t off = trunc_page(lblktosize(fs, lbn)); + voff_t endoff = round_page(lblktosize(fs, lbn) + size); + + mutex_enter(uobj->vmobjlock); + while (off < endoff) { + pg = uvm_pagelookup(uobj, off); + KASSERT((pg == NULL && (vp->v_vflag & VV_MAPPED) == 0 && + (size & PAGE_MASK) == 0 && + blkoff(fs, size) == 0) || + (pg != NULL && pg->owner == curproc->p_pid && + pg->lowner == curlwp->l_lid)); + off += PAGE_SIZE; + } + mutex_exit(uobj->vmobjlock); + } +#endif + + *bnp = 0; +#ifdef DIAGNOSTIC + if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { + printf("dev = 0x%llx, bsize = %d, size = %d, fs = %s\n", + (unsigned long long)ip->i_dev, fs->fs_bsize, size, + fs->fs_fsmnt); + panic("ffs_alloc: bad size"); + } + if (cred == NOCRED) + panic("ffs_alloc: missing credential"); +#endif /* DIAGNOSTIC */ + if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) + goto nospace; + if (freespace(fs, fs->fs_minfree) <= 0 && + kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL, + NULL, NULL) != 0) + goto nospace; +#if defined(QUOTA) || defined(QUOTA2) + mutex_exit(&ump->um_lock); + if ((error = chkdq(ip, btodb(size), cred, 0)) != 0) + return (error); + mutex_enter(&ump->um_lock); +#endif + + if (bpref >= fs->fs_size) + bpref = 0; + if (bpref == 0) + cg = ino_to_cg(fs, ip->i_number); + else + cg = dtog(fs, bpref); + bno = ffs_hashalloc(ip, cg, bpref, size, flags, ffs_alloccg); + if (bno > 0) { + DIP_ADD(ip, blocks, btodb(size)); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + *bnp = bno; + return (0); + } +#if defined(QUOTA) || defined(QUOTA2) + /* + * Restore user's disk quota because allocation failed. + */ + (void) chkdq(ip, -btodb(size), cred, FORCE); +#endif + if (flags & B_CONTIG) { + /* + * XXX ump->um_lock handling is "suspect" at best. + * For the case where ffs_hashalloc() fails early + * in the B_CONTIG case we reach here with um_lock + * already unlocked, so we can't release it again + * like in the normal error path. See kern/39206. + * + * + * Fail silently - it's up to our caller to report + * errors. 
+ */ + return (ENOSPC); + } +nospace: + mutex_exit(&ump->um_lock); + ffs_fserr(fs, kauth_cred_geteuid(cred), "file system full"); + uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt); + return (ENOSPC); +} + +/* + * Reallocate a fragment to a bigger size + * + * The number and size of the old block is given, and a preference + * and new size is also specified. The allocator attempts to extend + * the original block. Failing that, the regular block allocator is + * invoked to get an appropriate block. + * + * => called with um_lock held + * => return with um_lock released + */ +int +ffs_realloccg(struct inode *ip, daddr_t lbprev, daddr_t bpref, int osize, + int nsize, kauth_cred_t cred, struct buf **bpp, daddr_t *blknop) +{ + struct ufsmount *ump; + struct fs *fs; + struct buf *bp; + int cg, request, error; + daddr_t bprev, bno; + + fs = ip->i_fs; + ump = ip->i_ump; + + KASSERT(mutex_owned(&ump->um_lock)); + +#ifdef UVM_PAGE_TRKOWN + + /* + * Sanity-check that allocations within the file size + * do not allow other threads to read the stale contents + * of newly allocated blocks. + * Unlike in ffs_alloc(), here pages must always exist + * for such allocations, because only the last block of a file + * can be a fragment and ffs_write() will reallocate the + * fragment to the new size using ufs_balloc_range(), + * which always creates pages to cover blocks it allocates. + */ + + if (ITOV(ip)->v_type == VREG) { + struct vm_page *pg; + struct uvm_object *uobj = &ITOV(ip)->v_uobj; + voff_t off = trunc_page(lblktosize(fs, lbprev)); + voff_t endoff = round_page(lblktosize(fs, lbprev) + osize); + + mutex_enter(uobj->vmobjlock); + while (off < endoff) { + pg = uvm_pagelookup(uobj, off); + KASSERT(pg->owner == curproc->p_pid && + pg->lowner == curlwp->l_lid); + off += PAGE_SIZE; + } + mutex_exit(uobj->vmobjlock); + } +#endif + +#ifdef DIAGNOSTIC + if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || + (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { + printf( + "dev = 0x%llx, bsize = %d, osize = %d, nsize = %d, fs = %s\n", + (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize, + fs->fs_fsmnt); + panic("ffs_realloccg: bad size"); + } + if (cred == NOCRED) + panic("ffs_realloccg: missing credential"); +#endif /* DIAGNOSTIC */ + if (freespace(fs, fs->fs_minfree) <= 0 && + kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL, + NULL, NULL) != 0) { + mutex_exit(&ump->um_lock); + goto nospace; + } + if (fs->fs_magic == FS_UFS2_MAGIC) + bprev = ufs_rw64(ip->i_ffs2_db[lbprev], UFS_FSNEEDSWAP(fs)); + else + bprev = ufs_rw32(ip->i_ffs1_db[lbprev], UFS_FSNEEDSWAP(fs)); + + if (bprev == 0) { + printf("dev = 0x%llx, bsize = %d, bprev = %" PRId64 ", fs = %s\n", + (unsigned long long)ip->i_dev, fs->fs_bsize, bprev, + fs->fs_fsmnt); + panic("ffs_realloccg: bad bprev"); + } + mutex_exit(&ump->um_lock); + + /* + * Allocate the extra space in the buffer. + */ + if (bpp != NULL && + (error = bread(ITOV(ip), lbprev, osize, NOCRED, 0, &bp)) != 0) { + brelse(bp, 0); + return (error); + } +#if defined(QUOTA) || defined(QUOTA2) + if ((error = chkdq(ip, btodb(nsize - osize), cred, 0)) != 0) { + if (bpp != NULL) { + brelse(bp, 0); + } + return (error); + } +#endif + /* + * Check for extension in the existing location. 
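+ * For example, with 1 KB fragments in an 8 KB block, growing the last + * fragment of a file from 2 KB to 3 KB only needs the next fragment of + * the same block to be free, in which case ffs_fragextend() claims it in + * place; otherwise a new location is allocated below and the old + * fragments are released.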
+ */ + cg = dtog(fs, bprev); + mutex_enter(&ump->um_lock); + if ((bno = ffs_fragextend(ip, cg, bprev, osize, nsize)) != 0) { + DIP_ADD(ip, blocks, btodb(nsize - osize)); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + + if (bpp != NULL) { + if (bp->b_blkno != fsbtodb(fs, bno)) + panic("bad blockno"); + allocbuf(bp, nsize, 1); + memset((char *)bp->b_data + osize, 0, nsize - osize); + mutex_enter(bp->b_objlock); + KASSERT(!cv_has_waiters(&bp->b_done)); + bp->b_oflags |= BO_DONE; + mutex_exit(bp->b_objlock); + *bpp = bp; + } + if (blknop != NULL) { + *blknop = bno; + } + return (0); + } + /* + * Allocate a new disk location. + */ + if (bpref >= fs->fs_size) + bpref = 0; + switch ((int)fs->fs_optim) { + case FS_OPTSPACE: + /* + * Allocate an exact sized fragment. Although this makes + * best use of space, we will waste time relocating it if + * the file continues to grow. If the fragmentation is + * less than half of the minimum free reserve, we choose + * to begin optimizing for time. + */ + request = nsize; + if (fs->fs_minfree < 5 || + fs->fs_cstotal.cs_nffree > + fs->fs_dsize * fs->fs_minfree / (2 * 100)) + break; + + if (ffs_log_changeopt) { + log(LOG_NOTICE, + "%s: optimization changed from SPACE to TIME\n", + fs->fs_fsmnt); + } + + fs->fs_optim = FS_OPTTIME; + break; + case FS_OPTTIME: + /* + * At this point we have discovered a file that is trying to + * grow a small fragment to a larger fragment. To save time, + * we allocate a full sized block, then free the unused portion. + * If the file continues to grow, the `ffs_fragextend' call + * above will be able to grow it in place without further + * copying. If aberrant programs cause disk fragmentation to + * grow within 2% of the free reserve, we choose to begin + * optimizing for space. + */ + request = fs->fs_bsize; + if (fs->fs_cstotal.cs_nffree < + fs->fs_dsize * (fs->fs_minfree - 2) / 100) + break; + + if (ffs_log_changeopt) { + log(LOG_NOTICE, + "%s: optimization changed from TIME to SPACE\n", + fs->fs_fsmnt); + } + + fs->fs_optim = FS_OPTSPACE; + break; + default: + printf("dev = 0x%llx, optim = %d, fs = %s\n", + (unsigned long long)ip->i_dev, fs->fs_optim, fs->fs_fsmnt); + panic("ffs_realloccg: bad optim"); + /* NOTREACHED */ + } + bno = ffs_hashalloc(ip, cg, bpref, request, 0, ffs_alloccg); + if (bno > 0) { + if ((ip->i_ump->um_mountp->mnt_wapbl) && + (ITOV(ip)->v_type != VREG)) { + UFS_WAPBL_REGISTER_DEALLOCATION( + ip->i_ump->um_mountp, fsbtodb(fs, bprev), + osize); + } else { + ffs_blkfree(fs, ip->i_devvp, bprev, (long)osize, + ip->i_number); + } + if (nsize < request) { + if ((ip->i_ump->um_mountp->mnt_wapbl) && + (ITOV(ip)->v_type != VREG)) { + UFS_WAPBL_REGISTER_DEALLOCATION( + ip->i_ump->um_mountp, + fsbtodb(fs, (bno + numfrags(fs, nsize))), + request - nsize); + } else + ffs_blkfree(fs, ip->i_devvp, + bno + numfrags(fs, nsize), + (long)(request - nsize), ip->i_number); + } + DIP_ADD(ip, blocks, btodb(nsize - osize)); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (bpp != NULL) { + bp->b_blkno = fsbtodb(fs, bno); + allocbuf(bp, nsize, 1); + memset((char *)bp->b_data + osize, 0, (u_int)nsize - osize); + mutex_enter(bp->b_objlock); + KASSERT(!cv_has_waiters(&bp->b_done)); + bp->b_oflags |= BO_DONE; + mutex_exit(bp->b_objlock); + *bpp = bp; + } + if (blknop != NULL) { + *blknop = bno; + } + return (0); + } + mutex_exit(&ump->um_lock); + +#if defined(QUOTA) || defined(QUOTA2) + /* + * Restore user's disk quota because allocation failed. 
+ */ + (void) chkdq(ip, -btodb(nsize - osize), cred, FORCE); +#endif + if (bpp != NULL) { + brelse(bp, 0); + } + +nospace: + /* + * no space available + */ + ffs_fserr(fs, kauth_cred_geteuid(cred), "file system full"); + uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt); + return (ENOSPC); +} + +/* + * Allocate an inode in the file system. + * + * If allocating a directory, use ffs_dirpref to select the inode. + * If allocating in a directory, the following hierarchy is followed: + * 1) allocate the preferred inode. + * 2) allocate an inode in the same cylinder group. + * 3) quadradically rehash into other cylinder groups, until an + * available inode is located. + * If no inode preference is given the following hierarchy is used + * to allocate an inode: + * 1) allocate an inode in cylinder group 0. + * 2) quadradically rehash into other cylinder groups, until an + * available inode is located. + * + * => um_lock not held upon entry or return + */ +int +ffs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred, + struct vnode **vpp) +{ + struct ufsmount *ump; + struct inode *pip; + struct fs *fs; + struct inode *ip; + struct timespec ts; + ino_t ino, ipref; + int cg, error; + + UFS_WAPBL_JUNLOCK_ASSERT(pvp->v_mount); + + *vpp = NULL; + pip = VTOI(pvp); + fs = pip->i_fs; + ump = pip->i_ump; + + error = UFS_WAPBL_BEGIN(pvp->v_mount); + if (error) { + return error; + } + mutex_enter(&ump->um_lock); + if (fs->fs_cstotal.cs_nifree == 0) + goto noinodes; + + if ((mode & IFMT) == IFDIR) + ipref = ffs_dirpref(pip); + else + ipref = pip->i_number; + if (ipref >= fs->fs_ncg * fs->fs_ipg) + ipref = 0; + cg = ino_to_cg(fs, ipref); + /* + * Track number of dirs created one after another + * in a same cg without intervening by files. + */ + if ((mode & IFMT) == IFDIR) { + if (fs->fs_contigdirs[cg] < 255) + fs->fs_contigdirs[cg]++; + } else { + if (fs->fs_contigdirs[cg] > 0) + fs->fs_contigdirs[cg]--; + } + ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, ffs_nodealloccg); + if (ino == 0) + goto noinodes; + UFS_WAPBL_END(pvp->v_mount); + error = VFS_VGET(pvp->v_mount, ino, vpp); + if (error) { + int err; + err = UFS_WAPBL_BEGIN(pvp->v_mount); + if (err == 0) + ffs_vfree(pvp, ino, mode); + if (err == 0) + UFS_WAPBL_END(pvp->v_mount); + return (error); + } + KASSERT((*vpp)->v_type == VNON); + ip = VTOI(*vpp); + if (ip->i_mode) { +#if 0 + printf("mode = 0%o, inum = %d, fs = %s\n", + ip->i_mode, ip->i_number, fs->fs_fsmnt); +#else + printf("dmode %x mode %x dgen %x gen %x\n", + DIP(ip, mode), ip->i_mode, + DIP(ip, gen), ip->i_gen); + printf("size %llx blocks %llx\n", + (long long)DIP(ip, size), (long long)DIP(ip, blocks)); + printf("ino %llu ipref %llu\n", (unsigned long long)ino, + (unsigned long long)ipref); +#if 0 + error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), + (int)fs->fs_bsize, NOCRED, 0, &bp); +#endif + +#endif + panic("ffs_valloc: dup alloc"); + } + if (DIP(ip, blocks)) { /* XXX */ + printf("free inode %s/%llu had %" PRId64 " blocks\n", + fs->fs_fsmnt, (unsigned long long)ino, DIP(ip, blocks)); + DIP_ASSIGN(ip, blocks, 0); + } + ip->i_flag &= ~IN_SPACECOUNTED; + ip->i_flags = 0; + DIP_ASSIGN(ip, flags, 0); + /* + * Set up a new generation number for this inode. 
+ */ + ip->i_gen++; + DIP_ASSIGN(ip, gen, ip->i_gen); + if (fs->fs_magic == FS_UFS2_MAGIC) { + vfs_timestamp(&ts); + ip->i_ffs2_birthtime = ts.tv_sec; + ip->i_ffs2_birthnsec = ts.tv_nsec; + } + return (0); +noinodes: + mutex_exit(&ump->um_lock); + UFS_WAPBL_END(pvp->v_mount); + ffs_fserr(fs, kauth_cred_geteuid(cred), "out of inodes"); + uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt); + return (ENOSPC); +} + +/* + * Find a cylinder group in which to place a directory. + * + * The policy implemented by this algorithm is to allocate a + * directory inode in the same cylinder group as its parent + * directory, but also to reserve space for its files inodes + * and data. Restrict the number of directories which may be + * allocated one after another in the same cylinder group + * without intervening allocation of files. + * + * If we allocate a first level directory then force allocation + * in another cylinder group. + */ +static ino_t +ffs_dirpref(struct inode *pip) +{ + register struct fs *fs; + int cg, prefcg; + int64_t dirsize, cgsize, curdsz; + int avgifree, avgbfree, avgndir; + int minifree, minbfree, maxndir; + int mincg, minndir; + int maxcontigdirs; + + KASSERT(mutex_owned(&pip->i_ump->um_lock)); + + fs = pip->i_fs; + + avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; + avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; + avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg; + + /* + * Force allocation in another cg if creating a first level dir. + */ + if (ITOV(pip)->v_vflag & VV_ROOT) { + prefcg = random() % fs->fs_ncg; + mincg = prefcg; + minndir = fs->fs_ipg; + for (cg = prefcg; cg < fs->fs_ncg; cg++) + if (fs->fs_cs(fs, cg).cs_ndir < minndir && + fs->fs_cs(fs, cg).cs_nifree >= avgifree && + fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { + mincg = cg; + minndir = fs->fs_cs(fs, cg).cs_ndir; + } + for (cg = 0; cg < prefcg; cg++) + if (fs->fs_cs(fs, cg).cs_ndir < minndir && + fs->fs_cs(fs, cg).cs_nifree >= avgifree && + fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { + mincg = cg; + minndir = fs->fs_cs(fs, cg).cs_ndir; + } + return ((ino_t)(fs->fs_ipg * mincg)); + } + + /* + * Count various limits which used for + * optimal allocation of a directory inode. + */ + maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg); + minifree = avgifree - fs->fs_ipg / 4; + if (minifree < 0) + minifree = 0; + minbfree = avgbfree - fragstoblks(fs, fs->fs_fpg) / 4; + if (minbfree < 0) + minbfree = 0; + cgsize = (int64_t)fs->fs_fsize * fs->fs_fpg; + dirsize = (int64_t)fs->fs_avgfilesize * fs->fs_avgfpdir; + if (avgndir != 0) { + curdsz = (cgsize - (int64_t)avgbfree * fs->fs_bsize) / avgndir; + if (dirsize < curdsz) + dirsize = curdsz; + } + if (cgsize < dirsize * 255) + maxcontigdirs = cgsize / dirsize; + else + maxcontigdirs = 255; + if (fs->fs_avgfpdir > 0) + maxcontigdirs = min(maxcontigdirs, + fs->fs_ipg / fs->fs_avgfpdir); + if (maxcontigdirs == 0) + maxcontigdirs = 1; + + /* + * Limit number of dirs in one cg and reserve space for + * regular files, but only if we have no deficit in + * inodes or space. 
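+ * For example, assuming fs_avgfilesize of 16 KB and fs_avgfpdir of 64, + * dirsize above starts at 1 MB, so a 64 MB cylinder group admits at most + * 64 back-to-back directory creations (tracked in fs_contigdirs[]) before + * the preference moves to another group.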
+ */ + prefcg = ino_to_cg(fs, pip->i_number); + for (cg = prefcg; cg < fs->fs_ncg; cg++) + if (fs->fs_cs(fs, cg).cs_ndir < maxndir && + fs->fs_cs(fs, cg).cs_nifree >= minifree && + fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { + if (fs->fs_contigdirs[cg] < maxcontigdirs) + return ((ino_t)(fs->fs_ipg * cg)); + } + for (cg = 0; cg < prefcg; cg++) + if (fs->fs_cs(fs, cg).cs_ndir < maxndir && + fs->fs_cs(fs, cg).cs_nifree >= minifree && + fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { + if (fs->fs_contigdirs[cg] < maxcontigdirs) + return ((ino_t)(fs->fs_ipg * cg)); + } + /* + * This is a backstop when we are deficient in space. + */ + for (cg = prefcg; cg < fs->fs_ncg; cg++) + if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) + return ((ino_t)(fs->fs_ipg * cg)); + for (cg = 0; cg < prefcg; cg++) + if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) + break; + return ((ino_t)(fs->fs_ipg * cg)); +} + +/* + * Select the desired position for the next block in a file. The file is + * logically divided into sections. The first section is composed of the + * direct blocks. Each additional section contains fs_maxbpg blocks. + * + * If no blocks have been allocated in the first section, the policy is to + * request a block in the same cylinder group as the inode that describes + * the file. If no blocks have been allocated in any other section, the + * policy is to place the section in a cylinder group with a greater than + * average number of free blocks. An appropriate cylinder group is found + * by using a rotor that sweeps the cylinder groups. When a new group of + * blocks is needed, the sweep begins in the cylinder group following the + * cylinder group from which the previous allocation was made. The sweep + * continues until a cylinder group with greater than the average number + * of free blocks is found. If the allocation is for the first block in an + * indirect block, the information on the previous allocation is unavailable; + * here a best guess is made based upon the logical block number being + * allocated. + * + * If a section is already partially allocated, the policy is to + * contiguously allocate fs_maxcontig blocks. The end of one of these + * contiguous blocks and the beginning of the next is laid out + * contigously if possible. + * + * => um_lock held on entry and exit + */ +daddr_t +ffs_blkpref_ufs1(struct inode *ip, daddr_t lbn, int indx, int flags, + int32_t *bap /* XXX ondisk32 */) +{ + struct fs *fs; + int cg; + int avgbfree, startcg; + + KASSERT(mutex_owned(&ip->i_ump->um_lock)); + + fs = ip->i_fs; + + /* + * If allocating a contiguous file with B_CONTIG, use the hints + * in the inode extentions to return the desired block. + * + * For metadata (indirect blocks) return the address of where + * the first indirect block resides - we'll scan for the next + * available slot if we need to allocate more than one indirect + * block. For data, return the address of the actual block + * relative to the address of the first data block. + */ + if (flags & B_CONTIG) { + KASSERT(ip->i_ffs_first_data_blk != 0); + KASSERT(ip->i_ffs_first_indir_blk != 0); + if (flags & B_METAONLY) + return ip->i_ffs_first_indir_blk; + else + return ip->i_ffs_first_data_blk + blkstofrags(fs, lbn); + } + + if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { + if (lbn < NDADDR + NINDIR(fs)) { + cg = ino_to_cg(fs, ip->i_number); + return (cgbase(fs, cg) + fs->fs_frag); + } + /* + * Find a cylinder with greater than average number of + * unused data blocks. 
+ */ + if (indx == 0 || bap[indx - 1] == 0) + startcg = + ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg; + else + startcg = dtog(fs, + ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1); + startcg %= fs->fs_ncg; + avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; + for (cg = startcg; cg < fs->fs_ncg; cg++) + if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { + return (cgbase(fs, cg) + fs->fs_frag); + } + for (cg = 0; cg < startcg; cg++) + if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { + return (cgbase(fs, cg) + fs->fs_frag); + } + return (0); + } + /* + * We just always try to lay things out contiguously. + */ + return ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag; +} + +daddr_t +ffs_blkpref_ufs2(struct inode *ip, daddr_t lbn, int indx, int flags, + int64_t *bap) +{ + struct fs *fs; + int cg; + int avgbfree, startcg; + + KASSERT(mutex_owned(&ip->i_ump->um_lock)); + + fs = ip->i_fs; + + /* + * If allocating a contiguous file with B_CONTIG, use the hints + * in the inode extentions to return the desired block. + * + * For metadata (indirect blocks) return the address of where + * the first indirect block resides - we'll scan for the next + * available slot if we need to allocate more than one indirect + * block. For data, return the address of the actual block + * relative to the address of the first data block. + */ + if (flags & B_CONTIG) { + KASSERT(ip->i_ffs_first_data_blk != 0); + KASSERT(ip->i_ffs_first_indir_blk != 0); + if (flags & B_METAONLY) + return ip->i_ffs_first_indir_blk; + else + return ip->i_ffs_first_data_blk + blkstofrags(fs, lbn); + } + + if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { + if (lbn < NDADDR + NINDIR(fs)) { + cg = ino_to_cg(fs, ip->i_number); + return (cgbase(fs, cg) + fs->fs_frag); + } + /* + * Find a cylinder with greater than average number of + * unused data blocks. + */ + if (indx == 0 || bap[indx - 1] == 0) + startcg = + ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg; + else + startcg = dtog(fs, + ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1); + startcg %= fs->fs_ncg; + avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; + for (cg = startcg; cg < fs->fs_ncg; cg++) + if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { + return (cgbase(fs, cg) + fs->fs_frag); + } + for (cg = 0; cg < startcg; cg++) + if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { + return (cgbase(fs, cg) + fs->fs_frag); + } + return (0); + } + /* + * We just always try to lay things out contiguously. + */ + return ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag; +} + + +/* + * Implement the cylinder overflow algorithm. + * + * The policy implemented by this algorithm is: + * 1) allocate the block in its requested cylinder group. + * 2) quadradically rehash on the cylinder group number. + * 3) brute force search for a free block. 
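+ * For example, with fs_ncg == 16 and a preferred cg of 5, step 2 probes + * cgs 6, 8, 12 and 4 (offsets 1, 2, 4 and 8, wrapping modulo fs_ncg), + * after which step 3 sweeps the remaining groups starting at cg 7.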
+ * + * => called with um_lock held + * => returns with um_lock released on success, held on failure + * (*allocator releases lock on success, retains lock on failure) + */ +/*VARARGS5*/ +static daddr_t +ffs_hashalloc(struct inode *ip, int cg, daddr_t pref, + int size /* size for data blocks, mode for inodes */, + int flags, daddr_t (*allocator)(struct inode *, int, daddr_t, int, int)) +{ + struct fs *fs; + daddr_t result; + int i, icg = cg; + + fs = ip->i_fs; + /* + * 1: preferred cylinder group + */ + result = (*allocator)(ip, cg, pref, size, flags); + if (result) + return (result); + + if (flags & B_CONTIG) + return (result); + /* + * 2: quadratic rehash + */ + for (i = 1; i < fs->fs_ncg; i *= 2) { + cg += i; + if (cg >= fs->fs_ncg) + cg -= fs->fs_ncg; + result = (*allocator)(ip, cg, 0, size, flags); + if (result) + return (result); + } + /* + * 3: brute force search + * Note that we start at i == 2, since 0 was checked initially, + * and 1 is always checked in the quadratic rehash. + */ + cg = (icg + 2) % fs->fs_ncg; + for (i = 2; i < fs->fs_ncg; i++) { + result = (*allocator)(ip, cg, 0, size, flags); + if (result) + return (result); + cg++; + if (cg == fs->fs_ncg) + cg = 0; + } + return (0); +} + +/* + * Determine whether a fragment can be extended. + * + * Check to see if the necessary fragments are available, and + * if they are, allocate them. + * + * => called with um_lock held + * => returns with um_lock released on success, held on failure + */ +static daddr_t +ffs_fragextend(struct inode *ip, int cg, daddr_t bprev, int osize, int nsize) +{ + struct ufsmount *ump; + struct fs *fs; + struct cg *cgp; + struct buf *bp; + daddr_t bno; + int frags, bbase; + int i, error; + u_int8_t *blksfree; + + fs = ip->i_fs; + ump = ip->i_ump; + + KASSERT(mutex_owned(&ump->um_lock)); + + if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize)) + return (0); + frags = numfrags(fs, nsize); + bbase = fragnum(fs, bprev); + if (bbase > fragnum(fs, (bprev + frags - 1))) { + /* cannot extend across a block boundary */ + return (0); + } + mutex_exit(&ump->um_lock); + error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp); + if (error) + goto fail; + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) + goto fail; + cgp->cg_old_time = ufs_rw32(time_second, UFS_FSNEEDSWAP(fs)); + if ((fs->fs_magic != FS_UFS1_MAGIC) || + (fs->fs_old_flags & FS_FLAGS_UPDATED)) + cgp->cg_time = ufs_rw64(time_second, UFS_FSNEEDSWAP(fs)); + bno = dtogd(fs, bprev); + blksfree = cg_blksfree(cgp, UFS_FSNEEDSWAP(fs)); + for (i = numfrags(fs, osize); i < frags; i++) + if (isclr(blksfree, bno + i)) + goto fail; + /* + * the current fragment can be extended + * deduct the count on fragment being extended into + * increase the count on the remaining fragment (if any) + * allocate the extended piece + */ + for (i = frags; i < fs->fs_frag - bbase; i++) + if (isclr(blksfree, bno + i)) + break; + ufs_add32(cgp->cg_frsum[i - numfrags(fs, osize)], -1, UFS_FSNEEDSWAP(fs)); + if (i != frags) + ufs_add32(cgp->cg_frsum[i - frags], 1, UFS_FSNEEDSWAP(fs)); + mutex_enter(&ump->um_lock); + for (i = numfrags(fs, osize); i < frags; i++) { + clrbit(blksfree, bno + i); + ufs_add32(cgp->cg_cs.cs_nffree, -1, UFS_FSNEEDSWAP(fs)); + fs->fs_cstotal.cs_nffree--; + fs->fs_cs(fs, cg).cs_nffree--; + } + fs->fs_fmod = 1; + ACTIVECG_CLR(fs, cg); + mutex_exit(&ump->um_lock); + bdwrite(bp); + return (bprev); + + fail: + brelse(bp, 0); + mutex_enter(&ump->um_lock); + return (0); +} + +/* + * 
Determine whether a block can be allocated. + * + * Check to see if a block of the appropriate size is available, + * and if it is, allocate it. + */ +static daddr_t +ffs_alloccg(struct inode *ip, int cg, daddr_t bpref, int size, int flags) +{ + struct ufsmount *ump; + struct fs *fs = ip->i_fs; + struct cg *cgp; + struct buf *bp; + int32_t bno; + daddr_t blkno; + int error, frags, allocsiz, i; + u_int8_t *blksfree; +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + + ump = ip->i_ump; + + KASSERT(mutex_owned(&ump->um_lock)); + + if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) + return (0); + mutex_exit(&ump->um_lock); + error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp); + if (error) + goto fail; + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp, needswap) || + (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) + goto fail; + cgp->cg_old_time = ufs_rw32(time_second, needswap); + if ((fs->fs_magic != FS_UFS1_MAGIC) || + (fs->fs_old_flags & FS_FLAGS_UPDATED)) + cgp->cg_time = ufs_rw64(time_second, needswap); + if (size == fs->fs_bsize) { + mutex_enter(&ump->um_lock); + blkno = ffs_alloccgblk(ip, bp, bpref, flags); + ACTIVECG_CLR(fs, cg); + mutex_exit(&ump->um_lock); + bdwrite(bp); + return (blkno); + } + /* + * check to see if any fragments are already available + * allocsiz is the size which will be allocated, hacking + * it down to a smaller size if necessary + */ + blksfree = cg_blksfree(cgp, needswap); + frags = numfrags(fs, size); + for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) + if (cgp->cg_frsum[allocsiz] != 0) + break; + if (allocsiz == fs->fs_frag) { + /* + * no fragments were available, so a block will be + * allocated, and hacked up + */ + if (cgp->cg_cs.cs_nbfree == 0) + goto fail; + mutex_enter(&ump->um_lock); + blkno = ffs_alloccgblk(ip, bp, bpref, flags); + bno = dtogd(fs, blkno); + for (i = frags; i < fs->fs_frag; i++) + setbit(blksfree, bno + i); + i = fs->fs_frag - frags; + ufs_add32(cgp->cg_cs.cs_nffree, i, needswap); + fs->fs_cstotal.cs_nffree += i; + fs->fs_cs(fs, cg).cs_nffree += i; + fs->fs_fmod = 1; + ufs_add32(cgp->cg_frsum[i], 1, needswap); + ACTIVECG_CLR(fs, cg); + mutex_exit(&ump->um_lock); + bdwrite(bp); + return (blkno); + } + bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); +#if 0 + /* + * XXX fvdl mapsearch will panic, and never return -1 + * also: returning NULL as daddr_t ? + */ + if (bno < 0) + goto fail; +#endif + for (i = 0; i < frags; i++) + clrbit(blksfree, bno + i); + mutex_enter(&ump->um_lock); + ufs_add32(cgp->cg_cs.cs_nffree, -frags, needswap); + fs->fs_cstotal.cs_nffree -= frags; + fs->fs_cs(fs, cg).cs_nffree -= frags; + fs->fs_fmod = 1; + ufs_add32(cgp->cg_frsum[allocsiz], -1, needswap); + if (frags != allocsiz) + ufs_add32(cgp->cg_frsum[allocsiz - frags], 1, needswap); + blkno = cgbase(fs, cg) + bno; + ACTIVECG_CLR(fs, cg); + mutex_exit(&ump->um_lock); + bdwrite(bp); + return blkno; + + fail: + brelse(bp, 0); + mutex_enter(&ump->um_lock); + return (0); +} + +/* + * Allocate a block in a cylinder group. + * + * This algorithm implements the following policy: + * 1) allocate the requested block. + * 2) allocate a rotationally optimal block in the same cylinder. + * 3) allocate the next available block on the block rotor for the + * specified cylinder group. + * Note that this routine only allocates fs_bsize blocks; these + * blocks may be fragmented by the routine that allocates them. 
+ */ +static daddr_t +ffs_alloccgblk(struct inode *ip, struct buf *bp, daddr_t bpref, int flags) +{ + struct ufsmount *ump; + struct fs *fs = ip->i_fs; + struct cg *cgp; + int cg; + daddr_t blkno; + int32_t bno; + u_int8_t *blksfree; +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + + ump = ip->i_ump; + + KASSERT(mutex_owned(&ump->um_lock)); + + cgp = (struct cg *)bp->b_data; + blksfree = cg_blksfree(cgp, needswap); + if (bpref == 0 || dtog(fs, bpref) != ufs_rw32(cgp->cg_cgx, needswap)) { + bpref = ufs_rw32(cgp->cg_rotor, needswap); + } else { + bpref = blknum(fs, bpref); + bno = dtogd(fs, bpref); + /* + * if the requested block is available, use it + */ + if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno))) + goto gotit; + /* + * if the requested data block isn't available and we are + * trying to allocate a contiguous file, return an error. + */ + if ((flags & (B_CONTIG | B_METAONLY)) == B_CONTIG) + return (0); + } + + /* + * Take the next available block in this cylinder group. + */ + bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag); + if (bno < 0) + return (0); + cgp->cg_rotor = ufs_rw32(bno, needswap); +gotit: + blkno = fragstoblks(fs, bno); + ffs_clrblock(fs, blksfree, blkno); + ffs_clusteracct(fs, cgp, blkno, -1); + ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap); + fs->fs_cstotal.cs_nbfree--; + fs->fs_cs(fs, ufs_rw32(cgp->cg_cgx, needswap)).cs_nbfree--; + if ((fs->fs_magic == FS_UFS1_MAGIC) && + ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) { + int cylno; + cylno = old_cbtocylno(fs, bno); + KASSERT(cylno >= 0); + KASSERT(cylno < fs->fs_old_ncyl); + KASSERT(old_cbtorpos(fs, bno) >= 0); + KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, bno) < fs->fs_old_nrpos); + ufs_add16(old_cg_blks(fs, cgp, cylno, needswap)[old_cbtorpos(fs, bno)], -1, + needswap); + ufs_add32(old_cg_blktot(cgp, needswap)[cylno], -1, needswap); + } + fs->fs_fmod = 1; + cg = ufs_rw32(cgp->cg_cgx, needswap); + blkno = cgbase(fs, cg) + bno; + return (blkno); +} + +/* + * Determine whether an inode can be allocated. + * + * Check to see if an inode is available, and if it is, + * allocate it using the following policy: + * 1) allocate the requested inode. + * 2) allocate the next available inode after the requested + * inode in the specified cylinder group. + */ +static daddr_t +ffs_nodealloccg(struct inode *ip, int cg, daddr_t ipref, int mode, int flags) +{ + struct ufsmount *ump = ip->i_ump; + struct fs *fs = ip->i_fs; + struct cg *cgp; + struct buf *bp, *ibp; + u_int8_t *inosused; + int error, start, len, loc, map, i; + int32_t initediblk; + daddr_t nalloc; + struct ufs2_dinode *dp2; +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + + KASSERT(mutex_owned(&ump->um_lock)); + UFS_WAPBL_JLOCK_ASSERT(ip->i_ump->um_mountp); + + if (fs->fs_cs(fs, cg).cs_nifree == 0) + return (0); + mutex_exit(&ump->um_lock); + ibp = NULL; + initediblk = -1; +retry: + error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp); + if (error) + goto fail; + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp, needswap) || cgp->cg_cs.cs_nifree == 0) + goto fail; + + if (ibp != NULL && + initediblk != ufs_rw32(cgp->cg_initediblk, needswap)) { + /* Another thread allocated more inodes so we retry the test. */ + brelse(ibp, 0); + ibp = NULL; + } + /* + * Check to see if we need to initialize more inodes. 
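+ * (UFS2 only.) Inode blocks beyond cg_initediblk have never been written, + * so once allocations approach that mark a fresh block of INOPB(fs) + * on-disk inodes is zeroed and stamped with random generation numbers + * before the allocation completes.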
+ */ + if (fs->fs_magic == FS_UFS2_MAGIC && ibp == NULL) { + initediblk = ufs_rw32(cgp->cg_initediblk, needswap); + nalloc = fs->fs_ipg - ufs_rw32(cgp->cg_cs.cs_nifree, needswap); + if (nalloc + INOPB(fs) > initediblk && + initediblk < ufs_rw32(cgp->cg_niblk, needswap)) { + /* + * We have to release the cg buffer here to prevent + * a deadlock when reading the inode block will + * run a copy-on-write that might use this cg. + */ + brelse(bp, 0); + bp = NULL; + error = ffs_getblk(ip->i_devvp, fsbtodb(fs, + ino_to_fsba(fs, cg * fs->fs_ipg + initediblk)), + FFS_NOBLK, fs->fs_bsize, false, &ibp); + if (error) + goto fail; + goto retry; + } + } + + cgp->cg_old_time = ufs_rw32(time_second, needswap); + if ((fs->fs_magic != FS_UFS1_MAGIC) || + (fs->fs_old_flags & FS_FLAGS_UPDATED)) + cgp->cg_time = ufs_rw64(time_second, needswap); + inosused = cg_inosused(cgp, needswap); + if (ipref) { + ipref %= fs->fs_ipg; + if (isclr(inosused, ipref)) + goto gotit; + } + start = ufs_rw32(cgp->cg_irotor, needswap) / NBBY; + len = howmany(fs->fs_ipg - ufs_rw32(cgp->cg_irotor, needswap), + NBBY); + loc = skpc(0xff, len, &inosused[start]); + if (loc == 0) { + len = start + 1; + start = 0; + loc = skpc(0xff, len, &inosused[0]); + if (loc == 0) { + printf("cg = %d, irotor = %d, fs = %s\n", + cg, ufs_rw32(cgp->cg_irotor, needswap), + fs->fs_fsmnt); + panic("ffs_nodealloccg: map corrupted"); + /* NOTREACHED */ + } + } + i = start + len - loc; + map = inosused[i] ^ 0xff; + if (map == 0) { + printf("fs = %s\n", fs->fs_fsmnt); + panic("ffs_nodealloccg: block not in map"); + } + ipref = i * NBBY + ffs(map) - 1; + cgp->cg_irotor = ufs_rw32(ipref, needswap); +gotit: + UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp, cg * fs->fs_ipg + ipref, + mode); + /* + * Check to see if we need to initialize more inodes. + */ + if (ibp != NULL) { + KASSERT(initediblk == ufs_rw32(cgp->cg_initediblk, needswap)); + memset(ibp->b_data, 0, fs->fs_bsize); + dp2 = (struct ufs2_dinode *)(ibp->b_data); + for (i = 0; i < INOPB(fs); i++) { + /* + * Don't bother to swap, it's supposed to be + * random, after all. + */ + dp2->di_gen = (cprng_fast32() & INT32_MAX) / 2 + 1; + dp2++; + } + initediblk += INOPB(fs); + cgp->cg_initediblk = ufs_rw32(initediblk, needswap); + } + + mutex_enter(&ump->um_lock); + ACTIVECG_CLR(fs, cg); + setbit(inosused, ipref); + ufs_add32(cgp->cg_cs.cs_nifree, -1, needswap); + fs->fs_cstotal.cs_nifree--; + fs->fs_cs(fs, cg).cs_nifree--; + fs->fs_fmod = 1; + if ((mode & IFMT) == IFDIR) { + ufs_add32(cgp->cg_cs.cs_ndir, 1, needswap); + fs->fs_cstotal.cs_ndir++; + fs->fs_cs(fs, cg).cs_ndir++; + } + mutex_exit(&ump->um_lock); + if (ibp != NULL) { + bwrite(bp); + bawrite(ibp); + } else + bdwrite(bp); + return (cg * fs->fs_ipg + ipref); + fail: + if (bp != NULL) + brelse(bp, 0); + if (ibp != NULL) + brelse(ibp, 0); + mutex_enter(&ump->um_lock); + return (0); +} + +/* + * Allocate a block or fragment. + * + * The specified block or fragment is removed from the + * free map, possibly fragmenting a block in the process. 
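+ * For example, claiming a 2-fragment piece out of a fully free + * 8-fragment block first converts the block (cs_nbfree--, cs_nffree += + * fs_frag), then clears the two requested fragments, leaving six free + * fragments accounted in cs_nffree and cg_frsum.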
+ * + * This implementation should mirror fs_blkfree + * + * => um_lock not held on entry or exit + */ +int +ffs_blkalloc(struct inode *ip, daddr_t bno, long size) +{ + int error; + + error = ffs_check_bad_allocation(__func__, ip->i_fs, bno, size, + ip->i_dev, ip->i_uid); + if (error) + return error; + + return ffs_blkalloc_ump(ip->i_ump, bno, size); +} + +int +ffs_blkalloc_ump(struct ufsmount *ump, daddr_t bno, long size) +{ + struct fs *fs = ump->um_fs; + struct cg *cgp; + struct buf *bp; + int32_t fragno, cgbno; + int i, error, cg, blk, frags, bbase; + u_int8_t *blksfree; + const int needswap = UFS_FSNEEDSWAP(fs); + + KASSERT((u_int)size <= fs->fs_bsize && fragoff(fs, size) == 0 && + fragnum(fs, bno) + numfrags(fs, size) <= fs->fs_frag); + KASSERT(bno < fs->fs_size); + + cg = dtog(fs, bno); + error = bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + return error; + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp, needswap)) { + brelse(bp, 0); + return EIO; + } + cgp->cg_old_time = ufs_rw32(time_second, needswap); + cgp->cg_time = ufs_rw64(time_second, needswap); + cgbno = dtogd(fs, bno); + blksfree = cg_blksfree(cgp, needswap); + + mutex_enter(&ump->um_lock); + if (size == fs->fs_bsize) { + fragno = fragstoblks(fs, cgbno); + if (!ffs_isblock(fs, blksfree, fragno)) { + mutex_exit(&ump->um_lock); + brelse(bp, 0); + return EBUSY; + } + ffs_clrblock(fs, blksfree, fragno); + ffs_clusteracct(fs, cgp, fragno, -1); + ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap); + fs->fs_cstotal.cs_nbfree--; + fs->fs_cs(fs, cg).cs_nbfree--; + } else { + bbase = cgbno - fragnum(fs, cgbno); + + frags = numfrags(fs, size); + for (i = 0; i < frags; i++) { + if (isclr(blksfree, cgbno + i)) { + mutex_exit(&ump->um_lock); + brelse(bp, 0); + return EBUSY; + } + } + /* + * if a complete block is being split, account for it + */ + fragno = fragstoblks(fs, bbase); + if (ffs_isblock(fs, blksfree, fragno)) { + ufs_add32(cgp->cg_cs.cs_nffree, fs->fs_frag, needswap); + fs->fs_cstotal.cs_nffree += fs->fs_frag; + fs->fs_cs(fs, cg).cs_nffree += fs->fs_frag; + ffs_clusteracct(fs, cgp, fragno, -1); + ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap); + fs->fs_cstotal.cs_nbfree--; + fs->fs_cs(fs, cg).cs_nbfree--; + } + /* + * decrement the counts associated with the old frags + */ + blk = blkmap(fs, blksfree, bbase); + ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap); + /* + * allocate the fragment + */ + for (i = 0; i < frags; i++) { + clrbit(blksfree, cgbno + i); + } + ufs_add32(cgp->cg_cs.cs_nffree, -i, needswap); + fs->fs_cstotal.cs_nffree -= i; + fs->fs_cs(fs, cg).cs_nffree -= i; + /* + * add back in counts associated with the new frags + */ + blk = blkmap(fs, blksfree, bbase); + ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap); + } + fs->fs_fmod = 1; + ACTIVECG_CLR(fs, cg); + mutex_exit(&ump->um_lock); + bdwrite(bp); + return 0; +} + +/* + * Free a block or fragment. + * + * The specified block or fragment is placed back in the + * free map. If a fragment is deallocated, a possible + * block reassembly is checked. 
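+ * For example, freeing the last two busy fragments of a block whose + * other six fragments are already free reassembles the whole block: the + * fragment counts are rolled back and cs_nbfree is credited instead of + * cs_nffree.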
+ * + * => um_lock not held on entry or exit + */ +void +ffs_blkfree(struct fs *fs, struct vnode *devvp, daddr_t bno, long size, + ino_t inum) +{ + struct cg *cgp; + struct buf *bp; + struct ufsmount *ump; + daddr_t cgblkno; + int error, cg; + dev_t dev; + const bool devvp_is_snapshot = (devvp->v_type != VBLK); +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + + KASSERT(!devvp_is_snapshot); + + cg = dtog(fs, bno); + dev = devvp->v_rdev; + ump = VFSTOUFS(devvp->v_specmountpoint); + KASSERT(fs == ump->um_fs); + cgblkno = fsbtodb(fs, cgtod(fs, cg)); + if (ffs_snapblkfree(fs, devvp, bno, size, inum)) + return; + + error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum); + if (error) + return; + + error = bread(devvp, cgblkno, (int)fs->fs_cgsize, + NOCRED, B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + return; + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp, needswap)) { + brelse(bp, 0); + return; + } + + ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot); + + bdwrite(bp); +} + +/* + * Free a block or fragment from a snapshot cg copy. + * + * The specified block or fragment is placed back in the + * free map. If a fragment is deallocated, a possible + * block reassembly is checked. + * + * => um_lock not held on entry or exit + */ +void +ffs_blkfree_snap(struct fs *fs, struct vnode *devvp, daddr_t bno, long size, + ino_t inum) +{ + struct cg *cgp; + struct buf *bp; + struct ufsmount *ump; + daddr_t cgblkno; + int error, cg; + dev_t dev; + const bool devvp_is_snapshot = (devvp->v_type != VBLK); +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + + KASSERT(devvp_is_snapshot); + + cg = dtog(fs, bno); + dev = VTOI(devvp)->i_devvp->v_rdev; + ump = VFSTOUFS(devvp->v_mount); + cgblkno = fragstoblks(fs, cgtod(fs, cg)); + + error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum); + if (error) + return; + + error = bread(devvp, cgblkno, (int)fs->fs_cgsize, + NOCRED, B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + return; + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp, needswap)) { + brelse(bp, 0); + return; + } + + ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot); + + bdwrite(bp); +} + +static void +ffs_blkfree_common(struct ufsmount *ump, struct fs *fs, dev_t dev, + struct buf *bp, daddr_t bno, long size, bool devvp_is_snapshot) +{ + struct cg *cgp; + int32_t fragno, cgbno; + int i, cg, blk, frags, bbase; + u_int8_t *blksfree; + const int needswap = UFS_FSNEEDSWAP(fs); + + cg = dtog(fs, bno); + cgp = (struct cg *)bp->b_data; + cgp->cg_old_time = ufs_rw32(time_second, needswap); + if ((fs->fs_magic != FS_UFS1_MAGIC) || + (fs->fs_old_flags & FS_FLAGS_UPDATED)) + cgp->cg_time = ufs_rw64(time_second, needswap); + cgbno = dtogd(fs, bno); + blksfree = cg_blksfree(cgp, needswap); + mutex_enter(&ump->um_lock); + if (size == fs->fs_bsize) { + fragno = fragstoblks(fs, cgbno); + if (!ffs_isfreeblock(fs, blksfree, fragno)) { + if (devvp_is_snapshot) { + mutex_exit(&ump->um_lock); + return; + } + printf("dev = 0x%llx, block = %" PRId64 ", fs = %s\n", + (unsigned long long)dev, bno, fs->fs_fsmnt); + panic("blkfree: freeing free block"); + } + ffs_setblock(fs, blksfree, fragno); + ffs_clusteracct(fs, cgp, fragno, 1); + ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap); + fs->fs_cstotal.cs_nbfree++; + fs->fs_cs(fs, cg).cs_nbfree++; + if ((fs->fs_magic == FS_UFS1_MAGIC) && + ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) { + i = old_cbtocylno(fs, cgbno); + KASSERT(i >= 0); + KASSERT(i < fs->fs_old_ncyl); 
+ KASSERT(old_cbtorpos(fs, cgbno) >= 0); + KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, cgbno) < fs->fs_old_nrpos); + ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs, cgbno)], 1, + needswap); + ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap); + } + } else { + bbase = cgbno - fragnum(fs, cgbno); + /* + * decrement the counts associated with the old frags + */ + blk = blkmap(fs, blksfree, bbase); + ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap); + /* + * deallocate the fragment + */ + frags = numfrags(fs, size); + for (i = 0; i < frags; i++) { + if (isset(blksfree, cgbno + i)) { + printf("dev = 0x%llx, block = %" PRId64 + ", fs = %s\n", + (unsigned long long)dev, bno + i, + fs->fs_fsmnt); + panic("blkfree: freeing free frag"); + } + setbit(blksfree, cgbno + i); + } + ufs_add32(cgp->cg_cs.cs_nffree, i, needswap); + fs->fs_cstotal.cs_nffree += i; + fs->fs_cs(fs, cg).cs_nffree += i; + /* + * add back in counts associated with the new frags + */ + blk = blkmap(fs, blksfree, bbase); + ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap); + /* + * if a complete block has been reassembled, account for it + */ + fragno = fragstoblks(fs, bbase); + if (ffs_isblock(fs, blksfree, fragno)) { + ufs_add32(cgp->cg_cs.cs_nffree, -fs->fs_frag, needswap); + fs->fs_cstotal.cs_nffree -= fs->fs_frag; + fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; + ffs_clusteracct(fs, cgp, fragno, 1); + ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap); + fs->fs_cstotal.cs_nbfree++; + fs->fs_cs(fs, cg).cs_nbfree++; + if ((fs->fs_magic == FS_UFS1_MAGIC) && + ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) { + i = old_cbtocylno(fs, bbase); + KASSERT(i >= 0); + KASSERT(i < fs->fs_old_ncyl); + KASSERT(old_cbtorpos(fs, bbase) >= 0); + KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, bbase) < fs->fs_old_nrpos); + ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs, + bbase)], 1, needswap); + ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap); + } + } + } + fs->fs_fmod = 1; + ACTIVECG_CLR(fs, cg); + mutex_exit(&ump->um_lock); +} + +/* + * Free an inode. + */ +int +ffs_vfree(struct vnode *vp, ino_t ino, int mode) +{ + + return ffs_freefile(vp->v_mount, ino, mode); +} + +/* + * Do the actual free operation. + * The specified inode is placed back in the free map. 
+ * + * => um_lock not held on entry or exit + */ +int +ffs_freefile(struct mount *mp, ino_t ino, int mode) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + struct vnode *devvp; + struct cg *cgp; + struct buf *bp; + int error, cg; + daddr_t cgbno; + dev_t dev; +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + + cg = ino_to_cg(fs, ino); + devvp = ump->um_devvp; + dev = devvp->v_rdev; + cgbno = fsbtodb(fs, cgtod(fs, cg)); + + if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg) + panic("ifree: range: dev = 0x%llx, ino = %llu, fs = %s", + (long long)dev, (unsigned long long)ino, fs->fs_fsmnt); + error = bread(devvp, cgbno, (int)fs->fs_cgsize, + NOCRED, B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + return (error); + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp, needswap)) { + brelse(bp, 0); + return (0); + } + + ffs_freefile_common(ump, fs, dev, bp, ino, mode, false); + + bdwrite(bp); + + return 0; +} + +int +ffs_freefile_snap(struct fs *fs, struct vnode *devvp, ino_t ino, int mode) +{ + struct ufsmount *ump; + struct cg *cgp; + struct buf *bp; + int error, cg; + daddr_t cgbno; + dev_t dev; +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + + KASSERT(devvp->v_type != VBLK); + + cg = ino_to_cg(fs, ino); + dev = VTOI(devvp)->i_devvp->v_rdev; + ump = VFSTOUFS(devvp->v_mount); + cgbno = fragstoblks(fs, cgtod(fs, cg)); + if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg) + panic("ifree: range: dev = 0x%llx, ino = %llu, fs = %s", + (unsigned long long)dev, (unsigned long long)ino, + fs->fs_fsmnt); + error = bread(devvp, cgbno, (int)fs->fs_cgsize, + NOCRED, B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + return (error); + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp, needswap)) { + brelse(bp, 0); + return (0); + } + ffs_freefile_common(ump, fs, dev, bp, ino, mode, true); + + bdwrite(bp); + + return 0; +} + +static void +ffs_freefile_common(struct ufsmount *ump, struct fs *fs, dev_t dev, + struct buf *bp, ino_t ino, int mode, bool devvp_is_snapshot) +{ + int cg; + struct cg *cgp; + u_int8_t *inosused; +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + + cg = ino_to_cg(fs, ino); + cgp = (struct cg *)bp->b_data; + cgp->cg_old_time = ufs_rw32(time_second, needswap); + if ((fs->fs_magic != FS_UFS1_MAGIC) || + (fs->fs_old_flags & FS_FLAGS_UPDATED)) + cgp->cg_time = ufs_rw64(time_second, needswap); + inosused = cg_inosused(cgp, needswap); + ino %= fs->fs_ipg; + if (isclr(inosused, ino)) { + printf("ifree: dev = 0x%llx, ino = %llu, fs = %s\n", + (unsigned long long)dev, (unsigned long long)ino + + cg * fs->fs_ipg, fs->fs_fsmnt); + if (fs->fs_ronly == 0) + panic("ifree: freeing free inode"); + } + clrbit(inosused, ino); + if (!devvp_is_snapshot) + UFS_WAPBL_UNREGISTER_INODE(ump->um_mountp, + ino + cg * fs->fs_ipg, mode); + if (ino < ufs_rw32(cgp->cg_irotor, needswap)) + cgp->cg_irotor = ufs_rw32(ino, needswap); + ufs_add32(cgp->cg_cs.cs_nifree, 1, needswap); + mutex_enter(&ump->um_lock); + fs->fs_cstotal.cs_nifree++; + fs->fs_cs(fs, cg).cs_nifree++; + if ((mode & IFMT) == IFDIR) { + ufs_add32(cgp->cg_cs.cs_ndir, -1, needswap); + fs->fs_cstotal.cs_ndir--; + fs->fs_cs(fs, cg).cs_ndir--; + } + fs->fs_fmod = 1; + ACTIVECG_CLR(fs, cg); + mutex_exit(&ump->um_lock); +} + +/* + * Check to see if a file is free. 
+ */ +int +ffs_checkfreefile(struct fs *fs, struct vnode *devvp, ino_t ino) +{ + struct cg *cgp; + struct buf *bp; + daddr_t cgbno; + int ret, cg; + u_int8_t *inosused; + const bool devvp_is_snapshot = (devvp->v_type != VBLK); + + KASSERT(devvp_is_snapshot); + + cg = ino_to_cg(fs, ino); + if (devvp_is_snapshot) + cgbno = fragstoblks(fs, cgtod(fs, cg)); + else + cgbno = fsbtodb(fs, cgtod(fs, cg)); + if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg) + return 1; + if (bread(devvp, cgbno, (int)fs->fs_cgsize, NOCRED, 0, &bp)) { + brelse(bp, 0); + return 1; + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) { + brelse(bp, 0); + return 1; + } + inosused = cg_inosused(cgp, UFS_FSNEEDSWAP(fs)); + ino %= fs->fs_ipg; + ret = isclr(inosused, ino); + brelse(bp, 0); + return ret; +} + +/* + * Find a block of the specified size in the specified cylinder group. + * + * It is a panic if a request is made to find a block if none are + * available. + */ +static int32_t +ffs_mapsearch(struct fs *fs, struct cg *cgp, daddr_t bpref, int allocsiz) +{ + int32_t bno; + int start, len, loc, i; + int blk, field, subfield, pos; + int ostart, olen; + u_int8_t *blksfree; +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + + /* KASSERT(mutex_owned(&ump->um_lock)); */ + + /* + * find the fragment by searching through the free block + * map for an appropriate bit pattern + */ + if (bpref) + start = dtogd(fs, bpref) / NBBY; + else + start = ufs_rw32(cgp->cg_frotor, needswap) / NBBY; + blksfree = cg_blksfree(cgp, needswap); + len = howmany(fs->fs_fpg, NBBY) - start; + ostart = start; + olen = len; + loc = scanc((u_int)len, + (const u_char *)&blksfree[start], + (const u_char *)fragtbl[fs->fs_frag], + (1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1))))); + if (loc == 0) { + len = start + 1; + start = 0; + loc = scanc((u_int)len, + (const u_char *)&blksfree[0], + (const u_char *)fragtbl[fs->fs_frag], + (1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1))))); + if (loc == 0) { + printf("start = %d, len = %d, fs = %s\n", + ostart, olen, fs->fs_fsmnt); + printf("offset=%d %ld\n", + ufs_rw32(cgp->cg_freeoff, needswap), + (long)blksfree - (long)cgp); + printf("cg %d\n", cgp->cg_cgx); + panic("ffs_alloccg: map corrupted"); + /* NOTREACHED */ + } + } + bno = (start + len - loc) * NBBY; + cgp->cg_frotor = ufs_rw32(bno, needswap); + /* + * found the byte in the map + * sift through the bits to find the selected frag + */ + for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { + blk = blkmap(fs, blksfree, bno); + blk <<= 1; + field = around[allocsiz]; + subfield = inside[allocsiz]; + for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) { + if ((blk & field) == subfield) + return (bno + pos); + field <<= 1; + subfield <<= 1; + } + } + printf("bno = %d, fs = %s\n", bno, fs->fs_fsmnt); + panic("ffs_alloccg: block not in map"); + /* return (-1); */ +} + +/* + * Fserr prints the name of a file system with an error diagnostic. + * + * The form of the error message is: + * fs: error message + */ +static void +ffs_fserr(struct fs *fs, u_int uid, const char *cp) +{ + + log(LOG_ERR, "uid %d, pid %d, command %s, on %s: %s\n", + uid, curproc->p_pid, curproc->p_comm, fs->fs_fsmnt, cp); +} diff --git a/sys/ufs/ffs/ffs_appleufs.c b/sys/ufs/ffs/ffs_appleufs.c new file mode 100644 index 000000000..0067d40e9 --- /dev/null +++ b/sys/ufs/ffs/ffs_appleufs.c @@ -0,0 +1,154 @@ +/* $NetBSD: ffs_appleufs.c,v 1.12 2011/11/19 22:51:31 tls Exp $ */ + +/* + * Copyright (c) 2002 Darrin B. Jewell + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ffs_appleufs.c,v 1.12 2011/11/19 22:51:31 tls Exp $"); + +#include +#include +#if defined(_KERNEL) +#include +#include +#include +#endif + +#include +#include +#include +#include + +#if !defined(_KERNEL) && !defined(STANDALONE) +#include +#include +#include +#include +#include +#include +#define KASSERT(x) assert(x) +#endif + +/* + * this is the same calculation as in_cksum + */ +u_int16_t +ffs_appleufs_cksum(const struct appleufslabel *appleufs) +{ + const u_int16_t *p = (const u_int16_t *)appleufs; + int len = APPLEUFS_LABEL_SIZE; /* sizeof(struct appleufslabel) */ + long res = 0; + while (len > 1) { + res += *p++; + len -= 2; + } +#if 0 /* APPLEUFS_LABEL_SIZE is guaranteed to be even */ + if (len == 1) + res += htobe16(*(u_char *)p<<8); +#endif + res = (res >> 16) + (res & 0xffff); + res += (res >> 16); + return (~res); +} + +/* copies o to n, validating and byteswapping along the way + * returns 0 if ok, EINVAL if not valid + */ +int +ffs_appleufs_validate(const char *name, const struct appleufslabel *o, + struct appleufslabel *n) +{ + struct appleufslabel tmp; + if (!n) n = &tmp; + + if (o->ul_magic != be32toh(APPLEUFS_LABEL_MAGIC)) { + return EINVAL; + } + *n = *o; + n->ul_checksum = 0; + n->ul_checksum = ffs_appleufs_cksum(n); + if (n->ul_checksum != o->ul_checksum) { +#if defined(DIAGNOSTIC) || !defined(_KERNEL) + printf("%s: invalid APPLE UFS checksum. 
found 0x%x, expecting 0x%x", + name,o->ul_checksum,n->ul_checksum); +#endif + return EINVAL; + } + n->ul_magic = be32toh(o->ul_magic); + n->ul_version = be32toh(o->ul_version); + n->ul_time = be32toh(o->ul_time); + n->ul_namelen = be16toh(o->ul_namelen); + + if (n->ul_namelen > APPLEUFS_MAX_LABEL_NAME) { +#if defined(DIAGNOSTIC) || !defined(_KERNEL) + printf("%s: APPLE UFS label name too long, truncated.\n", + name); +#endif + n->ul_namelen = APPLEUFS_MAX_LABEL_NAME; + } + /* if len is max, will set ul_unused1 */ + n->ul_name[n->ul_namelen - 1] = '\0'; + +#ifdef DEBUG + printf("%s: found APPLE UFS label v%d: \"%s\"\n", + name,n->ul_version,n->ul_name); +#endif + n->ul_uuid = be64toh(o->ul_uuid); + + return 0; +} + +void +ffs_appleufs_set(struct appleufslabel *appleufs, const char *name, time_t t, + uint64_t uuid) +{ + size_t namelen; + if (!name) name = "untitled"; + if (t == ((time_t)-1)) { +#if defined(_KERNEL) + t = time_second; +#elif defined(STANDALONE) + t = 0; +#else + (void)time(&t); +#endif + } + if (uuid == 0) { +#if defined(_KERNEL) && !defined(STANDALONE) + uuid = cprng_fast64(); +#endif + } + namelen = strlen(name); + if (namelen > APPLEUFS_MAX_LABEL_NAME) + namelen = APPLEUFS_MAX_LABEL_NAME; + memset(appleufs, 0, APPLEUFS_LABEL_SIZE); + appleufs->ul_magic = htobe32(APPLEUFS_LABEL_MAGIC); + appleufs->ul_version = htobe32(APPLEUFS_LABEL_VERSION); + appleufs->ul_time = htobe32((u_int32_t)t); + appleufs->ul_namelen = htobe16(namelen); + strncpy(appleufs->ul_name,name,namelen); + appleufs->ul_uuid = htobe64(uuid); + appleufs->ul_checksum = ffs_appleufs_cksum(appleufs); +} diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c new file mode 100644 index 000000000..3683cbb19 --- /dev/null +++ b/sys/ufs/ffs/ffs_balloc.c @@ -0,0 +1,1051 @@ +/* $NetBSD: ffs_balloc.c,v 1.54 2011/04/23 07:36:02 hannken Exp $ */ + +/* + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Marshall + * Kirk McKusick and Network Associates Laboratories, the Security + * Research Division of Network Associates, Inc. under DARPA/SPAWAR + * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS + * research program + * + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_balloc.c 8.8 (Berkeley) 6/16/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ffs_balloc.c,v 1.54 2011/04/23 07:36:02 hannken Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_quota.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include + +static int ffs_balloc_ufs1(struct vnode *, off_t, int, kauth_cred_t, int, + struct buf **); +static int ffs_balloc_ufs2(struct vnode *, off_t, int, kauth_cred_t, int, + struct buf **); + +/* + * Balloc defines the structure of file system storage + * by allocating the physical blocks on a device given + * the inode and the logical block number in a file. + */ + +int +ffs_balloc(struct vnode *vp, off_t off, int size, kauth_cred_t cred, int flags, + struct buf **bpp) +{ + int error; + + if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC) + error = ffs_balloc_ufs2(vp, off, size, cred, flags, bpp); + else + error = ffs_balloc_ufs1(vp, off, size, cred, flags, bpp); + + if (error == 0 && bpp != NULL && (error = fscow_run(*bpp, false)) != 0) + brelse(*bpp, 0); + + return error; +} + +static int +ffs_balloc_ufs1(struct vnode *vp, off_t off, int size, kauth_cred_t cred, + int flags, struct buf **bpp) +{ + daddr_t lbn, lastlbn; + struct buf *bp, *nbp; + struct inode *ip = VTOI(vp); + struct fs *fs = ip->i_fs; + struct ufsmount *ump = ip->i_ump; + struct indir indirs[NIADDR + 2]; + daddr_t newb, pref, nb; + int32_t *bap; /* XXX ondisk32 */ + int deallocated, osize, nsize, num, i, error; + int32_t *blkp, *allocblk, allociblk[NIADDR + 1]; + int32_t *allocib; + int unwindidx = -1; +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + UVMHIST_FUNC("ffs_balloc"); UVMHIST_CALLED(ubchist); + + lbn = lblkno(fs, off); + size = blkoff(fs, off) + size; + if (size > fs->fs_bsize) + panic("ffs_balloc: blk too big"); + if (bpp != NULL) { + *bpp = NULL; + } + UVMHIST_LOG(ubchist, "vp %p lbn 0x%x size 0x%x", vp, lbn, size,0); + + if (lbn < 0) + return (EFBIG); + + /* + * If the next write will extend the file into a new block, + * and the file is currently composed of a fragment + * this fragment has to be extended to be a full block. 
+ */ + + lastlbn = lblkno(fs, ip->i_size); + if (lastlbn < NDADDR && lastlbn < lbn) { + nb = lastlbn; + osize = blksize(fs, ip, nb); + if (osize < fs->fs_bsize && osize > 0) { + mutex_enter(&ump->um_lock); + error = ffs_realloccg(ip, nb, + ffs_blkpref_ufs1(ip, lastlbn, nb, flags, + &ip->i_ffs1_db[0]), + osize, (int)fs->fs_bsize, cred, bpp, &newb); + if (error) + return (error); + ip->i_size = lblktosize(fs, nb + 1); + ip->i_ffs1_size = ip->i_size; + uvm_vnp_setsize(vp, ip->i_ffs1_size); + ip->i_ffs1_db[nb] = ufs_rw32((u_int32_t)newb, needswap); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (bpp && *bpp) { + if (flags & B_SYNC) + bwrite(*bpp); + else + bawrite(*bpp); + } + } + } + + /* + * The first NDADDR blocks are direct blocks + */ + + if (lbn < NDADDR) { + nb = ufs_rw32(ip->i_ffs1_db[lbn], needswap); + if (nb != 0 && ip->i_size >= lblktosize(fs, lbn + 1)) { + + /* + * The block is an already-allocated direct block + * and the file already extends past this block, + * thus this must be a whole block. + * Just read the block (if requested). + */ + + if (bpp != NULL) { + error = bread(vp, lbn, fs->fs_bsize, NOCRED, + B_MODIFY, bpp); + if (error) { + brelse(*bpp, 0); + return (error); + } + } + return (0); + } + if (nb != 0) { + + /* + * Consider need to reallocate a fragment. + */ + + osize = fragroundup(fs, blkoff(fs, ip->i_size)); + nsize = fragroundup(fs, size); + if (nsize <= osize) { + + /* + * The existing block is already + * at least as big as we want. + * Just read the block (if requested). + */ + + if (bpp != NULL) { + error = bread(vp, lbn, osize, NOCRED, + B_MODIFY, bpp); + if (error) { + brelse(*bpp, 0); + return (error); + } + } + return 0; + } else { + + /* + * The existing block is smaller than we want, + * grow it. + */ + mutex_enter(&ump->um_lock); + error = ffs_realloccg(ip, lbn, + ffs_blkpref_ufs1(ip, lbn, (int)lbn, flags, + &ip->i_ffs1_db[0]), + osize, nsize, cred, bpp, &newb); + if (error) + return (error); + } + } else { + + /* + * the block was not previously allocated, + * allocate a new block or fragment. + */ + + if (ip->i_size < lblktosize(fs, lbn + 1)) + nsize = fragroundup(fs, size); + else + nsize = fs->fs_bsize; + mutex_enter(&ump->um_lock); + error = ffs_alloc(ip, lbn, + ffs_blkpref_ufs1(ip, lbn, (int)lbn, flags, + &ip->i_ffs1_db[0]), + nsize, flags, cred, &newb); + if (error) + return (error); + if (bpp != NULL) { + error = ffs_getblk(vp, lbn, fsbtodb(fs, newb), + nsize, (flags & B_CLRBUF) != 0, bpp); + if (error) + return error; + } + } + ip->i_ffs1_db[lbn] = ufs_rw32((u_int32_t)newb, needswap); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + return (0); + } + + /* + * Determine the number of levels of indirection. + */ + + pref = 0; + if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0) + return (error); + + /* + * Fetch the first indirect block allocating if necessary. + */ + + --num; + nb = ufs_rw32(ip->i_ffs1_ib[indirs[0].in_off], needswap); + allocib = NULL; + allocblk = allociblk; + if (nb == 0) { + mutex_enter(&ump->um_lock); + pref = ffs_blkpref_ufs1(ip, lbn, 0, flags | B_METAONLY, NULL); + error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, + flags | B_METAONLY, cred, &newb); + if (error) + goto fail; + nb = newb; + *allocblk++ = nb; + error = ffs_getblk(vp, indirs[1].in_lbn, fsbtodb(fs, nb), + fs->fs_bsize, true, &bp); + if (error) + goto fail; + /* + * Write synchronously so that indirect blocks + * never point at garbage. 
+ */ + if ((error = bwrite(bp)) != 0) + goto fail; + unwindidx = 0; + allocib = &ip->i_ffs1_ib[indirs[0].in_off]; + *allocib = ufs_rw32(nb, needswap); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } + + /* + * Fetch through the indirect blocks, allocating as necessary. + */ + + for (i = 1;;) { + error = bread(vp, + indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, 0, &bp); + if (error) { + brelse(bp, 0); + goto fail; + } + bap = (int32_t *)bp->b_data; /* XXX ondisk32 */ + nb = ufs_rw32(bap[indirs[i].in_off], needswap); + if (i == num) + break; + i++; + if (nb != 0) { + brelse(bp, 0); + continue; + } + if (fscow_run(bp, true) != 0) { + brelse(bp, 0); + goto fail; + } + mutex_enter(&ump->um_lock); + /* Try to keep snapshot indirect blocks contiguous. */ + if (i == num && (ip->i_flags & SF_SNAPSHOT) != 0) + pref = ffs_blkpref_ufs1(ip, lbn, indirs[i-1].in_off, + flags | B_METAONLY, &bap[0]); + if (pref == 0) + pref = ffs_blkpref_ufs1(ip, lbn, 0, flags | B_METAONLY, + NULL); + error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, + flags | B_METAONLY, cred, &newb); + if (error) { + brelse(bp, 0); + goto fail; + } + nb = newb; + *allocblk++ = nb; + error = ffs_getblk(vp, indirs[i].in_lbn, fsbtodb(fs, nb), + fs->fs_bsize, true, &nbp); + if (error) { + brelse(bp, 0); + goto fail; + } + /* + * Write synchronously so that indirect blocks + * never point at garbage. + */ + if ((error = bwrite(nbp)) != 0) { + brelse(bp, 0); + goto fail; + } + if (unwindidx < 0) + unwindidx = i - 1; + bap[indirs[i - 1].in_off] = ufs_rw32(nb, needswap); + + /* + * If required, write synchronously, otherwise use + * delayed write. + */ + + if (flags & B_SYNC) { + bwrite(bp); + } else { + bdwrite(bp); + } + } + + if (flags & B_METAONLY) { + KASSERT(bpp != NULL); + *bpp = bp; + return (0); + } + + /* + * Get the data block, allocating if necessary. + */ + + if (nb == 0) { + if (fscow_run(bp, true) != 0) { + brelse(bp, 0); + goto fail; + } + mutex_enter(&ump->um_lock); + pref = ffs_blkpref_ufs1(ip, lbn, indirs[num].in_off, flags, + &bap[0]); + error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags, cred, + &newb); + if (error) { + brelse(bp, 0); + goto fail; + } + nb = newb; + *allocblk++ = nb; + if (bpp != NULL) { + error = ffs_getblk(vp, lbn, fsbtodb(fs, nb), + fs->fs_bsize, (flags & B_CLRBUF) != 0, bpp); + if (error) { + brelse(bp, 0); + goto fail; + } + } + bap[indirs[num].in_off] = ufs_rw32(nb, needswap); + if (allocib == NULL && unwindidx < 0) { + unwindidx = i - 1; + } + + /* + * If required, write synchronously, otherwise use + * delayed write. + */ + + if (flags & B_SYNC) { + bwrite(bp); + } else { + bdwrite(bp); + } + return (0); + } + brelse(bp, 0); + if (bpp != NULL) { + if (flags & B_CLRBUF) { + error = bread(vp, lbn, (int)fs->fs_bsize, + NOCRED, B_MODIFY, &nbp); + if (error) { + brelse(nbp, 0); + goto fail; + } + } else { + error = ffs_getblk(vp, lbn, fsbtodb(fs, nb), + fs->fs_bsize, true, &nbp); + if (error) + goto fail; + } + *bpp = nbp; + } + return (0); + +fail: + /* + * If we have failed part way through block allocation, we + * have to deallocate any indirect blocks that we have allocated. + */ + + if (unwindidx >= 0) { + + /* + * First write out any buffers we've created to resolve their + * softdeps. This must be done in reverse order of creation + * so that we resolve the dependencies in one pass. + * Write the cylinder group buffers for these buffers too. 
+ */ + + for (i = num; i >= unwindidx; i--) { + if (i == 0) { + break; + } + if (ffs_getblk(vp, indirs[i].in_lbn, FFS_NOBLK, + fs->fs_bsize, false, &bp) != 0) + continue; + if (bp->b_oflags & BO_DELWRI) { + nb = fsbtodb(fs, cgtod(fs, dtog(fs, + dbtofsb(fs, bp->b_blkno)))); + bwrite(bp); + if (ffs_getblk(ip->i_devvp, nb, FFS_NOBLK, + fs->fs_cgsize, false, &bp) != 0) + continue; + if (bp->b_oflags & BO_DELWRI) { + bwrite(bp); + } else { + brelse(bp, BC_INVAL); + } + } else { + brelse(bp, BC_INVAL); + } + } + + /* + * Undo the partial allocation. + */ + if (unwindidx == 0) { + *allocib = 0; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } else { + int r; + + r = bread(vp, indirs[unwindidx].in_lbn, + (int)fs->fs_bsize, NOCRED, 0, &bp); + if (r) { + panic("Could not unwind indirect block, error %d", r); + brelse(bp, 0); + } else { + bap = (int32_t *)bp->b_data; /* XXX ondisk32 */ + bap[indirs[unwindidx].in_off] = 0; + bwrite(bp); + } + } + for (i = unwindidx + 1; i <= num; i++) { + if (ffs_getblk(vp, indirs[i].in_lbn, FFS_NOBLK, + fs->fs_bsize, false, &bp) == 0) + brelse(bp, BC_INVAL); + } + } + for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) { + ffs_blkfree(fs, ip->i_devvp, *blkp, fs->fs_bsize, ip->i_number); + deallocated += fs->fs_bsize; + } + if (deallocated) { +#if defined(QUOTA) || defined(QUOTA2) + /* + * Restore user's disk quota because allocation failed. + */ + (void)chkdq(ip, -btodb(deallocated), cred, FORCE); +#endif + ip->i_ffs1_blocks -= btodb(deallocated); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } + return (error); +} + +static int +ffs_balloc_ufs2(struct vnode *vp, off_t off, int size, kauth_cred_t cred, + int flags, struct buf **bpp) +{ + daddr_t lbn, lastlbn; + struct buf *bp, *nbp; + struct inode *ip = VTOI(vp); + struct fs *fs = ip->i_fs; + struct ufsmount *ump = ip->i_ump; + struct indir indirs[NIADDR + 2]; + daddr_t newb, pref, nb; + int64_t *bap; + int deallocated, osize, nsize, num, i, error; + daddr_t *blkp, *allocblk, allociblk[NIADDR + 1]; + int64_t *allocib; + int unwindidx = -1; +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + UVMHIST_FUNC("ffs_balloc"); UVMHIST_CALLED(ubchist); + + lbn = lblkno(fs, off); + size = blkoff(fs, off) + size; + if (size > fs->fs_bsize) + panic("ffs_balloc: blk too big"); + if (bpp != NULL) { + *bpp = NULL; + } + UVMHIST_LOG(ubchist, "vp %p lbn 0x%x size 0x%x", vp, lbn, size,0); + + if (lbn < 0) + return (EFBIG); + +#ifdef notyet + /* + * Check for allocating external data. + */ + if (flags & IO_EXT) { + if (lbn >= NXADDR) + return (EFBIG); + /* + * If the next write will extend the data into a new block, + * and the data is currently composed of a fragment + * this fragment has to be extended to be a full block. 
+ */ + lastlbn = lblkno(fs, dp->di_extsize); + if (lastlbn < lbn) { + nb = lastlbn; + osize = sblksize(fs, dp->di_extsize, nb); + if (osize < fs->fs_bsize && osize > 0) { + mutex_enter(&ump->um_lock); + error = ffs_realloccg(ip, -1 - nb, + dp->di_extb[nb], + ffs_blkpref_ufs2(ip, lastlbn, (int)nb, + flags, &dp->di_extb[0]), + osize, + (int)fs->fs_bsize, cred, &bp); + if (error) + return (error); + dp->di_extsize = smalllblktosize(fs, nb + 1); + dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno); + bp->b_xflags |= BX_ALTDATA; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (flags & IO_SYNC) + bwrite(bp); + else + bawrite(bp); + } + } + /* + * All blocks are direct blocks + */ + if (flags & BA_METAONLY) + panic("ffs_balloc_ufs2: BA_METAONLY for ext block"); + nb = dp->di_extb[lbn]; + if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) { + error = bread(vp, -1 - lbn, fs->fs_bsize, + NOCRED, 0, &bp); + if (error) { + brelse(bp, 0); + return (error); + } + mutex_enter(&bp->b_interlock); + bp->b_blkno = fsbtodb(fs, nb); + bp->b_xflags |= BX_ALTDATA; + mutex_exit(&bp->b_interlock); + *bpp = bp; + return (0); + } + if (nb != 0) { + /* + * Consider need to reallocate a fragment. + */ + osize = fragroundup(fs, blkoff(fs, dp->di_extsize)); + nsize = fragroundup(fs, size); + if (nsize <= osize) { + error = bread(vp, -1 - lbn, osize, + NOCRED, 0, &bp); + if (error) { + brelse(bp, 0); + return (error); + } + mutex_enter(&bp->b_interlock); + bp->b_blkno = fsbtodb(fs, nb); + bp->b_xflags |= BX_ALTDATA; + mutex_exit(&bp->b_interlock); + } else { + mutex_enter(&ump->um_lock); + error = ffs_realloccg(ip, -1 - lbn, + dp->di_extb[lbn], + ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags, + &dp->di_extb[0]), + osize, nsize, cred, &bp); + if (error) + return (error); + bp->b_xflags |= BX_ALTDATA; + } + } else { + if (dp->di_extsize < smalllblktosize(fs, lbn + 1)) + nsize = fragroundup(fs, size); + else + nsize = fs->fs_bsize; + mutex_enter(&ump->um_lock); + error = ffs_alloc(ip, lbn, + ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags, + &dp->di_extb[0]), + nsize, flags, cred, &newb); + if (error) + return (error); + error = ffs_getblk(vp, -1 - lbn, fsbtodb(fs, newb), + nsize, (flags & BA_CLRBUF) != 0, &bp); + if (error) + return error; + bp->b_xflags |= BX_ALTDATA; + } + dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + *bpp = bp; + return (0); + } +#endif + /* + * If the next write will extend the file into a new block, + * and the file is currently composed of a fragment + * this fragment has to be extended to be a full block. 
+ */ + + lastlbn = lblkno(fs, ip->i_size); + if (lastlbn < NDADDR && lastlbn < lbn) { + nb = lastlbn; + osize = blksize(fs, ip, nb); + if (osize < fs->fs_bsize && osize > 0) { + mutex_enter(&ump->um_lock); + error = ffs_realloccg(ip, nb, + ffs_blkpref_ufs2(ip, lastlbn, nb, flags, + &ip->i_ffs2_db[0]), + osize, (int)fs->fs_bsize, cred, bpp, &newb); + if (error) + return (error); + ip->i_size = lblktosize(fs, nb + 1); + ip->i_ffs2_size = ip->i_size; + uvm_vnp_setsize(vp, ip->i_size); + ip->i_ffs2_db[nb] = ufs_rw64(newb, needswap); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (bpp) { + if (flags & B_SYNC) + bwrite(*bpp); + else + bawrite(*bpp); + } + } + } + + /* + * The first NDADDR blocks are direct blocks + */ + + if (lbn < NDADDR) { + nb = ufs_rw64(ip->i_ffs2_db[lbn], needswap); + if (nb != 0 && ip->i_size >= lblktosize(fs, lbn + 1)) { + + /* + * The block is an already-allocated direct block + * and the file already extends past this block, + * thus this must be a whole block. + * Just read the block (if requested). + */ + + if (bpp != NULL) { + error = bread(vp, lbn, fs->fs_bsize, NOCRED, + B_MODIFY, bpp); + if (error) { + brelse(*bpp, 0); + return (error); + } + } + return (0); + } + if (nb != 0) { + + /* + * Consider need to reallocate a fragment. + */ + + osize = fragroundup(fs, blkoff(fs, ip->i_size)); + nsize = fragroundup(fs, size); + if (nsize <= osize) { + + /* + * The existing block is already + * at least as big as we want. + * Just read the block (if requested). + */ + + if (bpp != NULL) { + error = bread(vp, lbn, osize, NOCRED, + B_MODIFY, bpp); + if (error) { + brelse(*bpp, 0); + return (error); + } + } + return 0; + } else { + + /* + * The existing block is smaller than we want, + * grow it. + */ + mutex_enter(&ump->um_lock); + error = ffs_realloccg(ip, lbn, + ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags, + &ip->i_ffs2_db[0]), + osize, nsize, cred, bpp, &newb); + if (error) + return (error); + } + } else { + + /* + * the block was not previously allocated, + * allocate a new block or fragment. + */ + + if (ip->i_size < lblktosize(fs, lbn + 1)) + nsize = fragroundup(fs, size); + else + nsize = fs->fs_bsize; + mutex_enter(&ump->um_lock); + error = ffs_alloc(ip, lbn, + ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags, + &ip->i_ffs2_db[0]), + nsize, flags, cred, &newb); + if (error) + return (error); + if (bpp != NULL) { + error = ffs_getblk(vp, lbn, fsbtodb(fs, newb), + nsize, (flags & B_CLRBUF) != 0, bpp); + if (error) + return error; + } + } + ip->i_ffs2_db[lbn] = ufs_rw64(newb, needswap); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + return (0); + } + + /* + * Determine the number of levels of indirection. + */ + + pref = 0; + if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0) + return (error); + + /* + * Fetch the first indirect block allocating if necessary. + */ + + --num; + nb = ufs_rw64(ip->i_ffs2_ib[indirs[0].in_off], needswap); + allocib = NULL; + allocblk = allociblk; + if (nb == 0) { + mutex_enter(&ump->um_lock); + pref = ffs_blkpref_ufs2(ip, lbn, 0, flags | B_METAONLY, NULL); + error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, + flags | B_METAONLY, cred, &newb); + if (error) + goto fail; + nb = newb; + *allocblk++ = nb; + error = ffs_getblk(vp, indirs[1].in_lbn, fsbtodb(fs, nb), + fs->fs_bsize, true, &bp); + if (error) + goto fail; + /* + * Write synchronously so that indirect blocks + * never point at garbage. 
+ */ + if ((error = bwrite(bp)) != 0) + goto fail; + unwindidx = 0; + allocib = &ip->i_ffs2_ib[indirs[0].in_off]; + *allocib = ufs_rw64(nb, needswap); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } + + /* + * Fetch through the indirect blocks, allocating as necessary. + */ + + for (i = 1;;) { + error = bread(vp, + indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, 0, &bp); + if (error) { + brelse(bp, 0); + goto fail; + } + bap = (int64_t *)bp->b_data; + nb = ufs_rw64(bap[indirs[i].in_off], needswap); + if (i == num) + break; + i++; + if (nb != 0) { + brelse(bp, 0); + continue; + } + if (fscow_run(bp, true) != 0) { + brelse(bp, 0); + goto fail; + } + mutex_enter(&ump->um_lock); + /* Try to keep snapshot indirect blocks contiguous. */ + if (i == num && (ip->i_flags & SF_SNAPSHOT) != 0) + pref = ffs_blkpref_ufs2(ip, lbn, indirs[i-1].in_off, + flags | B_METAONLY, &bap[0]); + if (pref == 0) + pref = ffs_blkpref_ufs2(ip, lbn, 0, flags | B_METAONLY, + NULL); + error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, + flags | B_METAONLY, cred, &newb); + if (error) { + brelse(bp, 0); + goto fail; + } + nb = newb; + *allocblk++ = nb; + error = ffs_getblk(vp, indirs[i].in_lbn, fsbtodb(fs, nb), + fs->fs_bsize, true, &nbp); + if (error) { + brelse(bp, 0); + goto fail; + } + /* + * Write synchronously so that indirect blocks + * never point at garbage. + */ + if ((error = bwrite(nbp)) != 0) { + brelse(bp, 0); + goto fail; + } + if (unwindidx < 0) + unwindidx = i - 1; + bap[indirs[i - 1].in_off] = ufs_rw64(nb, needswap); + + /* + * If required, write synchronously, otherwise use + * delayed write. + */ + + if (flags & B_SYNC) { + bwrite(bp); + } else { + bdwrite(bp); + } + } + + if (flags & B_METAONLY) { + KASSERT(bpp != NULL); + *bpp = bp; + return (0); + } + + /* + * Get the data block, allocating if necessary. + */ + + if (nb == 0) { + if (fscow_run(bp, true) != 0) { + brelse(bp, 0); + goto fail; + } + mutex_enter(&ump->um_lock); + pref = ffs_blkpref_ufs2(ip, lbn, indirs[num].in_off, flags, + &bap[0]); + error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags, cred, + &newb); + if (error) { + brelse(bp, 0); + goto fail; + } + nb = newb; + *allocblk++ = nb; + if (bpp != NULL) { + error = ffs_getblk(vp, lbn, fsbtodb(fs, nb), + fs->fs_bsize, (flags & B_CLRBUF) != 0, bpp); + if (error) { + brelse(bp, 0); + goto fail; + } + } + bap[indirs[num].in_off] = ufs_rw64(nb, needswap); + if (allocib == NULL && unwindidx < 0) { + unwindidx = i - 1; + } + + /* + * If required, write synchronously, otherwise use + * delayed write. + */ + + if (flags & B_SYNC) { + bwrite(bp); + } else { + bdwrite(bp); + } + return (0); + } + brelse(bp, 0); + if (bpp != NULL) { + if (flags & B_CLRBUF) { + error = bread(vp, lbn, (int)fs->fs_bsize, + NOCRED, B_MODIFY, &nbp); + if (error) { + brelse(nbp, 0); + goto fail; + } + } else { + error = ffs_getblk(vp, lbn, fsbtodb(fs, nb), + fs->fs_bsize, true, &nbp); + if (error) + goto fail; + } + *bpp = nbp; + } + return (0); + +fail: + /* + * If we have failed part way through block allocation, we + * have to deallocate any indirect blocks that we have allocated. + */ + + if (unwindidx >= 0) { + + /* + * First write out any buffers we've created to resolve their + * softdeps. This must be done in reverse order of creation + * so that we resolve the dependencies in one pass. + * Write the cylinder group buffers for these buffers too. 
+ */ + + for (i = num; i >= unwindidx; i--) { + if (i == 0) { + break; + } + if (ffs_getblk(vp, indirs[i].in_lbn, FFS_NOBLK, + fs->fs_bsize, false, &bp) != 0) + continue; + if (bp->b_oflags & BO_DELWRI) { + nb = fsbtodb(fs, cgtod(fs, dtog(fs, + dbtofsb(fs, bp->b_blkno)))); + bwrite(bp); + if (ffs_getblk(ip->i_devvp, nb, FFS_NOBLK, + fs->fs_cgsize, false, &bp) != 0) + continue; + if (bp->b_oflags & BO_DELWRI) { + bwrite(bp); + } else { + brelse(bp, BC_INVAL); + } + } else { + brelse(bp, BC_INVAL); + } + } + + /* + * Now that any dependencies that we created have been + * resolved, we can undo the partial allocation. + */ + + if (unwindidx == 0) { + *allocib = 0; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } else { + int r; + + r = bread(vp, indirs[unwindidx].in_lbn, + (int)fs->fs_bsize, NOCRED, 0, &bp); + if (r) { + panic("Could not unwind indirect block, error %d", r); + brelse(bp, 0); + } else { + bap = (int64_t *)bp->b_data; + bap[indirs[unwindidx].in_off] = 0; + bwrite(bp); + } + } + for (i = unwindidx + 1; i <= num; i++) { + if (ffs_getblk(vp, indirs[i].in_lbn, FFS_NOBLK, + fs->fs_bsize, false, &bp) == 0) + brelse(bp, BC_INVAL); + } + } + for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) { + ffs_blkfree(fs, ip->i_devvp, *blkp, fs->fs_bsize, ip->i_number); + deallocated += fs->fs_bsize; + } + if (deallocated) { +#if defined(QUOTA) || defined(QUOTA2) + /* + * Restore user's disk quota because allocation failed. + */ + (void)chkdq(ip, -btodb(deallocated), cred, FORCE); +#endif + ip->i_ffs2_blocks -= btodb(deallocated); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } + + return (error); +} diff --git a/sys/ufs/ffs/ffs_bswap.c b/sys/ufs/ffs/ffs_bswap.c new file mode 100644 index 000000000..ddac30db0 --- /dev/null +++ b/sys/ufs/ffs/ffs_bswap.c @@ -0,0 +1,271 @@ +/* $NetBSD: ffs_bswap.c,v 1.35 2011/03/06 17:08:38 bouyer Exp $ */ + +/* + * Copyright (c) 1998 Manuel Bouyer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +__KERNEL_RCSID(0, "$NetBSD: ffs_bswap.c,v 1.35 2011/03/06 17:08:38 bouyer Exp $"); + +#include +#if defined(_KERNEL) +#include +#endif + +#include +#include +#include +#include +#include + +#if !defined(_KERNEL) +#include +#include +#include +#include +#define panic(x) printf("%s\n", (x)), abort() +#endif + +void +ffs_sb_swap(struct fs *o, struct fs *n) +{ + size_t i; + u_int32_t *o32, *n32; + + /* + * In order to avoid a lot of lines, as the first N fields (52) + * of the superblock up to fs_fmod are u_int32_t, we just loop + * here to convert them. + */ + o32 = (u_int32_t *)o; + n32 = (u_int32_t *)n; + for (i = 0; i < offsetof(struct fs, fs_fmod) / sizeof(u_int32_t); i++) + n32[i] = bswap32(o32[i]); + + n->fs_swuid = bswap64(o->fs_swuid); + n->fs_cgrotor = bswap32(o->fs_cgrotor); /* Unused */ + n->fs_old_cpc = bswap32(o->fs_old_cpc); + + /* These fields overlap with a possible location for the + * historic FS_DYNAMICPOSTBLFMT postbl table, and with the + * first half of the historic FS_42POSTBLFMT postbl table. + */ + n->fs_maxbsize = bswap32(o->fs_maxbsize); + /* XXX journal */ + n->fs_quota_magic = bswap32(o->fs_quota_magic); + for (i = 0; i < MAXQUOTAS; i++) + n->fs_quotafile[i] = bswap64(o->fs_quotafile[i]); + n->fs_sblockloc = bswap64(o->fs_sblockloc); + ffs_csumtotal_swap(&o->fs_cstotal, &n->fs_cstotal); + n->fs_time = bswap64(o->fs_time); + n->fs_size = bswap64(o->fs_size); + n->fs_dsize = bswap64(o->fs_dsize); + n->fs_csaddr = bswap64(o->fs_csaddr); + n->fs_pendingblocks = bswap64(o->fs_pendingblocks); + n->fs_pendinginodes = bswap32(o->fs_pendinginodes); + + /* These fields overlap with the second half of the + * historic FS_42POSTBLFMT postbl table + */ + for (i = 0; i < FSMAXSNAP; i++) + n->fs_snapinum[i] = bswap32(o->fs_snapinum[i]); + n->fs_avgfilesize = bswap32(o->fs_avgfilesize); + n->fs_avgfpdir = bswap32(o->fs_avgfpdir); + /* fs_sparecon[28] - ignore for now */ + n->fs_flags = bswap32(o->fs_flags); + n->fs_contigsumsize = bswap32(o->fs_contigsumsize); + n->fs_maxsymlinklen = bswap32(o->fs_maxsymlinklen); + n->fs_old_inodefmt = bswap32(o->fs_old_inodefmt); + n->fs_maxfilesize = bswap64(o->fs_maxfilesize); + n->fs_qbmask = bswap64(o->fs_qbmask); + n->fs_qfmask = bswap64(o->fs_qfmask); + n->fs_state = bswap32(o->fs_state); + n->fs_old_postblformat = bswap32(o->fs_old_postblformat); + n->fs_old_nrpos = bswap32(o->fs_old_nrpos); + n->fs_old_postbloff = bswap32(o->fs_old_postbloff); + n->fs_old_rotbloff = bswap32(o->fs_old_rotbloff); + + n->fs_magic = bswap32(o->fs_magic); +} + +void +ffs_dinode1_swap(struct ufs1_dinode *o, struct ufs1_dinode *n) +{ + + n->di_mode = bswap16(o->di_mode); + n->di_nlink = bswap16(o->di_nlink); + n->di_u.oldids[0] = bswap16(o->di_u.oldids[0]); + n->di_u.oldids[1] = bswap16(o->di_u.oldids[1]); + n->di_size = bswap64(o->di_size); + n->di_atime = bswap32(o->di_atime); + n->di_atimensec = bswap32(o->di_atimensec); + n->di_mtime = bswap32(o->di_mtime); + n->di_mtimensec = bswap32(o->di_mtimensec); + n->di_ctime = bswap32(o->di_ctime); + n->di_ctimensec = bswap32(o->di_ctimensec); + memcpy(n->di_db, o->di_db, (NDADDR + NIADDR) * sizeof(u_int32_t)); + n->di_flags = bswap32(o->di_flags); + n->di_blocks = bswap32(o->di_blocks); + n->di_gen = bswap32(o->di_gen); + n->di_uid = bswap32(o->di_uid); + n->di_gid = bswap32(o->di_gid); +} + +void +ffs_dinode2_swap(struct ufs2_dinode *o, struct ufs2_dinode *n) +{ + n->di_mode = bswap16(o->di_mode); + n->di_nlink = 
bswap16(o->di_nlink); + n->di_uid = bswap32(o->di_uid); + n->di_gid = bswap32(o->di_gid); + n->di_blksize = bswap32(o->di_blksize); + n->di_size = bswap64(o->di_size); + n->di_blocks = bswap64(o->di_blocks); + n->di_atime = bswap64(o->di_atime); + n->di_atimensec = bswap32(o->di_atimensec); + n->di_mtime = bswap64(o->di_mtime); + n->di_mtimensec = bswap32(o->di_mtimensec); + n->di_ctime = bswap64(o->di_ctime); + n->di_ctimensec = bswap32(o->di_ctimensec); + n->di_birthtime = bswap64(o->di_birthtime); + n->di_birthnsec = bswap32(o->di_birthnsec); + n->di_gen = bswap32(o->di_gen); + n->di_kernflags = bswap32(o->di_kernflags); + n->di_flags = bswap32(o->di_flags); + n->di_extsize = bswap32(o->di_extsize); + memcpy(n->di_extb, o->di_extb, (NXADDR + NDADDR + NIADDR) * 8); +} + +void +ffs_csum_swap(struct csum *o, struct csum *n, int size) +{ + size_t i; + u_int32_t *oint, *nint; + + oint = (u_int32_t*)o; + nint = (u_int32_t*)n; + + for (i = 0; i < size / sizeof(u_int32_t); i++) + nint[i] = bswap32(oint[i]); +} + +void +ffs_csumtotal_swap(struct csum_total *o, struct csum_total *n) +{ + n->cs_ndir = bswap64(o->cs_ndir); + n->cs_nbfree = bswap64(o->cs_nbfree); + n->cs_nifree = bswap64(o->cs_nifree); + n->cs_nffree = bswap64(o->cs_nffree); +} + +/* + * Note that ffs_cg_swap may be called with o == n. + */ +void +ffs_cg_swap(struct cg *o, struct cg *n, struct fs *fs) +{ + int i; + u_int32_t *n32, *o32; + u_int16_t *n16, *o16; + int32_t btotoff, boff, clustersumoff; + + n->cg_firstfield = bswap32(o->cg_firstfield); + n->cg_magic = bswap32(o->cg_magic); + n->cg_old_time = bswap32(o->cg_old_time); + n->cg_cgx = bswap32(o->cg_cgx); + n->cg_old_ncyl = bswap16(o->cg_old_ncyl); + n->cg_old_niblk = bswap16(o->cg_old_niblk); + n->cg_ndblk = bswap32(o->cg_ndblk); + n->cg_cs.cs_ndir = bswap32(o->cg_cs.cs_ndir); + n->cg_cs.cs_nbfree = bswap32(o->cg_cs.cs_nbfree); + n->cg_cs.cs_nifree = bswap32(o->cg_cs.cs_nifree); + n->cg_cs.cs_nffree = bswap32(o->cg_cs.cs_nffree); + n->cg_rotor = bswap32(o->cg_rotor); + n->cg_frotor = bswap32(o->cg_frotor); + n->cg_irotor = bswap32(o->cg_irotor); + for (i = 0; i < MAXFRAG; i++) + n->cg_frsum[i] = bswap32(o->cg_frsum[i]); + + if ((fs->fs_magic != FS_UFS2_MAGIC) && + (fs->fs_old_postblformat == FS_42POSTBLFMT)) { /* old format */ + struct ocg *on, *oo; + int j; + on = (struct ocg *)n; + oo = (struct ocg *)o; + + for (i = 0; i < 32; i++) { + on->cg_btot[i] = bswap32(oo->cg_btot[i]); + for (j = 0; j < 8; j++) + on->cg_b[i][j] = bswap16(oo->cg_b[i][j]); + } + memmove(on->cg_iused, oo->cg_iused, 256); + on->cg_magic = bswap32(oo->cg_magic); + } else { /* new format */ + + n->cg_old_btotoff = bswap32(o->cg_old_btotoff); + n->cg_old_boff = bswap32(o->cg_old_boff); + n->cg_iusedoff = bswap32(o->cg_iusedoff); + n->cg_freeoff = bswap32(o->cg_freeoff); + n->cg_nextfreeoff = bswap32(o->cg_nextfreeoff); + n->cg_clustersumoff = bswap32(o->cg_clustersumoff); + n->cg_clusteroff = bswap32(o->cg_clusteroff); + n->cg_nclusterblks = bswap32(o->cg_nclusterblks); + n->cg_niblk = bswap32(o->cg_niblk); + n->cg_initediblk = bswap32(o->cg_initediblk); + n->cg_time = bswap64(o->cg_time); + + if (n->cg_magic == CG_MAGIC) { + btotoff = n->cg_old_btotoff; + boff = n->cg_old_boff; + clustersumoff = n->cg_clustersumoff; + } else { + btotoff = bswap32(n->cg_old_btotoff); + boff = bswap32(n->cg_old_boff); + clustersumoff = bswap32(n->cg_clustersumoff); + } + + n32 = (u_int32_t *)((u_int8_t *)n + clustersumoff); + o32 = (u_int32_t *)((u_int8_t *)o + clustersumoff); + for (i = 1; i < fs->fs_contigsumsize + 1; 
i++) + n32[i] = bswap32(o32[i]); + + if (fs->fs_magic == FS_UFS2_MAGIC) + return; + + n32 = (u_int32_t *)((u_int8_t *)n + btotoff); + o32 = (u_int32_t *)((u_int8_t *)o + btotoff); + n16 = (u_int16_t *)((u_int8_t *)n + boff); + o16 = (u_int16_t *)((u_int8_t *)o + boff); + + for (i = 0; i < fs->fs_old_cpg; i++) + n32[i] = bswap32(o32[i]); + + for (i = 0; i < fs->fs_old_cpg * fs->fs_old_nrpos; i++) + n16[i] = bswap16(o16[i]); + } +} diff --git a/include/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h similarity index 100% rename from include/ufs/ffs/ffs_extern.h rename to sys/ufs/ffs/ffs_extern.h diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c new file mode 100644 index 000000000..0f6edcb7d --- /dev/null +++ b/sys/ufs/ffs/ffs_inode.c @@ -0,0 +1,725 @@ +/* $NetBSD: ffs_inode.c,v 1.108 2011/11/23 19:42:10 bouyer Exp $ */ + +/*- + * Copyright (c) 2008 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_inode.c 8.13 (Berkeley) 4/21/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ffs_inode.c,v 1.108 2011/11/23 19:42:10 bouyer Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_ffs.h" +#include "opt_quota.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +static int ffs_indirtrunc(struct inode *, daddr_t, daddr_t, daddr_t, int, + int64_t *); + +/* + * Update the access, modified, and inode change times as specified + * by the IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively. + * The IN_MODIFIED flag is used to specify that the inode needs to be + * updated but that the times have already been set. The access + * and modified times are taken from the second and third parameters; + * the inode change time is always taken from the current time. If + * UPDATE_WAIT flag is set, or UPDATE_DIROP is set then wait for the + * disk write of the inode to complete. + */ + +int +ffs_update(struct vnode *vp, const struct timespec *acc, + const struct timespec *mod, int updflags) +{ + struct fs *fs; + struct buf *bp; + struct inode *ip; + int error; + void *cp; + int waitfor, flags; + + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (0); + ip = VTOI(vp); + FFS_ITIMES(ip, acc, mod, NULL); + if (updflags & UPDATE_CLOSE) + flags = ip->i_flag & (IN_MODIFIED | IN_ACCESSED); + else + flags = ip->i_flag & IN_MODIFIED; + if (flags == 0) + return (0); + fs = ip->i_fs; + + if ((flags & IN_MODIFIED) != 0 && + (vp->v_mount->mnt_flag & MNT_ASYNC) == 0) { + waitfor = updflags & UPDATE_WAIT; + if ((updflags & UPDATE_DIROP) != 0) + waitfor |= UPDATE_WAIT; + } else + waitfor = 0; + + /* + * Ensure that uid and gid are correct. This is a temporary + * fix until fsck has been changed to do the update. 
+ */ + if (fs->fs_magic == FS_UFS1_MAGIC && /* XXX */ + fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */ + ip->i_ffs1_ouid = ip->i_uid; /* XXX */ + ip->i_ffs1_ogid = ip->i_gid; /* XXX */ + } /* XXX */ + error = bread(ip->i_devvp, + fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), + (int)fs->fs_bsize, NOCRED, B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + return (error); + } + ip->i_flag &= ~(IN_MODIFIED | IN_ACCESSED); + /* Keep unlinked inode list up to date */ + KDASSERT(DIP(ip, nlink) == ip->i_nlink); + if (ip->i_mode) { + if (ip->i_nlink > 0) { + UFS_WAPBL_UNREGISTER_INODE(ip->i_ump->um_mountp, + ip->i_number, ip->i_mode); + } else { + UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp, + ip->i_number, ip->i_mode); + } + } + if (fs->fs_magic == FS_UFS1_MAGIC) { + cp = (char *)bp->b_data + + (ino_to_fsbo(fs, ip->i_number) * DINODE1_SIZE); +#ifdef FFS_EI + if (UFS_FSNEEDSWAP(fs)) + ffs_dinode1_swap(ip->i_din.ffs1_din, + (struct ufs1_dinode *)cp); + else +#endif + memcpy(cp, ip->i_din.ffs1_din, DINODE1_SIZE); + } else { + cp = (char *)bp->b_data + + (ino_to_fsbo(fs, ip->i_number) * DINODE2_SIZE); +#ifdef FFS_EI + if (UFS_FSNEEDSWAP(fs)) + ffs_dinode2_swap(ip->i_din.ffs2_din, + (struct ufs2_dinode *)cp); + else +#endif + memcpy(cp, ip->i_din.ffs2_din, DINODE2_SIZE); + } + if (waitfor) { + return (bwrite(bp)); + } else { + bdwrite(bp); + return (0); + } +} + +#define SINGLE 0 /* index of single indirect block */ +#define DOUBLE 1 /* index of double indirect block */ +#define TRIPLE 2 /* index of triple indirect block */ +/* + * Truncate the inode oip to at most length size, freeing the + * disk blocks. + */ +int +ffs_truncate(struct vnode *ovp, off_t length, int ioflag, kauth_cred_t cred) +{ + daddr_t lastblock; + struct inode *oip = VTOI(ovp); + daddr_t bn, lastiblock[NIADDR], indir_lbn[NIADDR]; + daddr_t blks[NDADDR + NIADDR]; + struct fs *fs; + int offset, pgoffset, level; + int64_t count, blocksreleased = 0; + int i, aflag, nblocks; + int error, allerror = 0; + off_t osize; + int sync; + struct ufsmount *ump = oip->i_ump; + + if (ovp->v_type == VCHR || ovp->v_type == VBLK || + ovp->v_type == VFIFO || ovp->v_type == VSOCK) { + KASSERT(oip->i_size == 0); + return 0; + } + + if (length < 0) + return (EINVAL); + + if (ovp->v_type == VLNK && + (oip->i_size < ump->um_maxsymlinklen || + (ump->um_maxsymlinklen == 0 && DIP(oip, blocks) == 0))) { + KDASSERT(length == 0); + memset(SHORTLINK(oip), 0, (size_t)oip->i_size); + oip->i_size = 0; + DIP_ASSIGN(oip, size, 0); + oip->i_flag |= IN_CHANGE | IN_UPDATE; + return (ffs_update(ovp, NULL, NULL, 0)); + } + if (oip->i_size == length) { + /* still do a uvm_vnp_setsize() as writesize may be larger */ + uvm_vnp_setsize(ovp, length); + oip->i_flag |= IN_CHANGE | IN_UPDATE; + return (ffs_update(ovp, NULL, NULL, 0)); + } + fs = oip->i_fs; + if (length > ump->um_maxfilesize) + return (EFBIG); + + if ((oip->i_flags & SF_SNAPSHOT) != 0) + ffs_snapremove(ovp); + + osize = oip->i_size; + aflag = ioflag & IO_SYNC ? B_SYNC : 0; + + /* + * Lengthen the size of the file. We must ensure that the + * last byte of the file is allocated. Since the smallest + * value of osize is 0, length will be at least 1. 
+ */ + + if (osize < length) { + if (lblkno(fs, osize) < NDADDR && + lblkno(fs, osize) != lblkno(fs, length) && + blkroundup(fs, osize) != osize) { + off_t eob; + + eob = blkroundup(fs, osize); + uvm_vnp_setwritesize(ovp, eob); + error = ufs_balloc_range(ovp, osize, eob - osize, + cred, aflag); + if (error) { + (void) ffs_truncate(ovp, osize, + ioflag & IO_SYNC, cred); + return error; + } + if (ioflag & IO_SYNC) { + mutex_enter(ovp->v_interlock); + VOP_PUTPAGES(ovp, + trunc_page(osize & fs->fs_bmask), + round_page(eob), PGO_CLEANIT | PGO_SYNCIO | + PGO_JOURNALLOCKED); + } + } + uvm_vnp_setwritesize(ovp, length); + error = ufs_balloc_range(ovp, length - 1, 1, cred, aflag); + if (error) { + (void) ffs_truncate(ovp, osize, ioflag & IO_SYNC, cred); + return (error); + } + uvm_vnp_setsize(ovp, length); + oip->i_flag |= IN_CHANGE | IN_UPDATE; + KASSERT(ovp->v_size == oip->i_size); + return (ffs_update(ovp, NULL, NULL, 0)); + } + + /* + * When truncating a regular file down to a non-block-aligned size, + * we must zero the part of last block which is past the new EOF. + * We must synchronously flush the zeroed pages to disk + * since the new pages will be invalidated as soon as we + * inform the VM system of the new, smaller size. + * We must do this before acquiring the GLOCK, since fetching + * the pages will acquire the GLOCK internally. + * So there is a window where another thread could see a whole + * zeroed page past EOF, but that's life. + */ + + offset = blkoff(fs, length); + pgoffset = length & PAGE_MASK; + if (ovp->v_type == VREG && (pgoffset != 0 || offset != 0) && + osize > length) { + daddr_t lbn; + voff_t eoz; + int size; + + if (offset != 0) { + error = ufs_balloc_range(ovp, length - 1, 1, cred, + aflag); + if (error) + return error; + } + lbn = lblkno(fs, length); + size = blksize(fs, oip, lbn); + eoz = MIN(MAX(lblktosize(fs, lbn) + size, round_page(pgoffset)), + osize); + ubc_zerorange(&ovp->v_uobj, length, eoz - length, + UBC_UNMAP_FLAG(ovp)); + if (round_page(eoz) > round_page(length)) { + mutex_enter(ovp->v_interlock); + error = VOP_PUTPAGES(ovp, round_page(length), + round_page(eoz), + PGO_CLEANIT | PGO_DEACTIVATE | PGO_JOURNALLOCKED | + ((ioflag & IO_SYNC) ? PGO_SYNCIO : 0)); + if (error) + return error; + } + } + + genfs_node_wrlock(ovp); + oip->i_size = length; + DIP_ASSIGN(oip, size, length); + uvm_vnp_setsize(ovp, length); + /* + * Calculate index into inode's block list of + * last direct and indirect blocks (if any) + * which we want to keep. Lastblock is -1 when + * the file is truncated to 0. + */ + lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1; + lastiblock[SINGLE] = lastblock - NDADDR; + lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs); + lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs); + nblocks = btodb(fs->fs_bsize); + /* + * Update file and block pointers on disk before we start freeing + * blocks. If we crash before free'ing blocks below, the blocks + * will be returned to the free list. lastiblock values are also + * normalized to -1 for calls to ffs_indirtrunc below. 
+ */ + sync = 0; + for (level = TRIPLE; level >= SINGLE; level--) { + blks[NDADDR + level] = DIP(oip, ib[level]); + if (lastiblock[level] < 0 && blks[NDADDR + level] != 0) { + sync = 1; + DIP_ASSIGN(oip, ib[level], 0); + lastiblock[level] = -1; + } + } + for (i = 0; i < NDADDR; i++) { + blks[i] = DIP(oip, db[i]); + if (i > lastblock && blks[i] != 0) { + sync = 1; + DIP_ASSIGN(oip, db[i], 0); + } + } + oip->i_flag |= IN_CHANGE | IN_UPDATE; + if (sync) { + error = ffs_update(ovp, NULL, NULL, UPDATE_WAIT); + if (error && !allerror) + allerror = error; + } + + /* + * Having written the new inode to disk, save its new configuration + * and put back the old block pointers long enough to process them. + * Note that we save the new block configuration so we can check it + * when we are done. + */ + for (i = 0; i < NDADDR; i++) { + bn = DIP(oip, db[i]); + DIP_ASSIGN(oip, db[i], blks[i]); + blks[i] = bn; + } + for (i = 0; i < NIADDR; i++) { + bn = DIP(oip, ib[i]); + DIP_ASSIGN(oip, ib[i], blks[NDADDR + i]); + blks[NDADDR + i] = bn; + } + + oip->i_size = osize; + DIP_ASSIGN(oip, size, osize); + error = vtruncbuf(ovp, lastblock + 1, 0, 0); + if (error && !allerror) + allerror = error; + + /* + * Indirect blocks first. + */ + indir_lbn[SINGLE] = -NDADDR; + indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) - 1; + indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1; + for (level = TRIPLE; level >= SINGLE; level--) { + if (oip->i_ump->um_fstype == UFS1) + bn = ufs_rw32(oip->i_ffs1_ib[level],UFS_FSNEEDSWAP(fs)); + else + bn = ufs_rw64(oip->i_ffs2_ib[level],UFS_FSNEEDSWAP(fs)); + if (bn != 0) { + error = ffs_indirtrunc(oip, indir_lbn[level], + fsbtodb(fs, bn), lastiblock[level], level, &count); + if (error) + allerror = error; + blocksreleased += count; + if (lastiblock[level] < 0) { + DIP_ASSIGN(oip, ib[level], 0); + if (oip->i_ump->um_mountp->mnt_wapbl) { + UFS_WAPBL_REGISTER_DEALLOCATION( + oip->i_ump->um_mountp, + fsbtodb(fs, bn), fs->fs_bsize); + } else + ffs_blkfree(fs, oip->i_devvp, bn, + fs->fs_bsize, oip->i_number); + blocksreleased += nblocks; + } + } + if (lastiblock[level] >= 0) + goto done; + } + + /* + * All whole direct blocks or frags. + */ + for (i = NDADDR - 1; i > lastblock; i--) { + long bsize; + + if (oip->i_ump->um_fstype == UFS1) + bn = ufs_rw32(oip->i_ffs1_db[i], UFS_FSNEEDSWAP(fs)); + else + bn = ufs_rw64(oip->i_ffs2_db[i], UFS_FSNEEDSWAP(fs)); + if (bn == 0) + continue; + DIP_ASSIGN(oip, db[i], 0); + bsize = blksize(fs, oip, i); + if ((oip->i_ump->um_mountp->mnt_wapbl) && + (ovp->v_type != VREG)) { + UFS_WAPBL_REGISTER_DEALLOCATION(oip->i_ump->um_mountp, + fsbtodb(fs, bn), bsize); + } else + ffs_blkfree(fs, oip->i_devvp, bn, bsize, oip->i_number); + blocksreleased += btodb(bsize); + } + if (lastblock < 0) + goto done; + + /* + * Finally, look for a change in size of the + * last direct block; release any frags. + */ + if (oip->i_ump->um_fstype == UFS1) + bn = ufs_rw32(oip->i_ffs1_db[lastblock], UFS_FSNEEDSWAP(fs)); + else + bn = ufs_rw64(oip->i_ffs2_db[lastblock], UFS_FSNEEDSWAP(fs)); + if (bn != 0) { + long oldspace, newspace; + + /* + * Calculate amount of space we're giving + * back as old block size minus new block size. 
+ */ + oldspace = blksize(fs, oip, lastblock); + oip->i_size = length; + DIP_ASSIGN(oip, size, length); + newspace = blksize(fs, oip, lastblock); + if (newspace == 0) + panic("itrunc: newspace"); + if (oldspace - newspace > 0) { + /* + * Block number of space to be free'd is + * the old block # plus the number of frags + * required for the storage we're keeping. + */ + bn += numfrags(fs, newspace); + if ((oip->i_ump->um_mountp->mnt_wapbl) && + (ovp->v_type != VREG)) { + UFS_WAPBL_REGISTER_DEALLOCATION( + oip->i_ump->um_mountp, fsbtodb(fs, bn), + oldspace - newspace); + } else + ffs_blkfree(fs, oip->i_devvp, bn, + oldspace - newspace, oip->i_number); + blocksreleased += btodb(oldspace - newspace); + } + } + +done: +#ifdef DIAGNOSTIC + for (level = SINGLE; level <= TRIPLE; level++) + if (blks[NDADDR + level] != DIP(oip, ib[level])) + panic("itrunc1"); + for (i = 0; i < NDADDR; i++) + if (blks[i] != DIP(oip, db[i])) + panic("itrunc2"); + if (length == 0 && + (!LIST_EMPTY(&ovp->v_cleanblkhd) || !LIST_EMPTY(&ovp->v_dirtyblkhd))) + panic("itrunc3"); +#endif /* DIAGNOSTIC */ + /* + * Put back the real size. + */ + oip->i_size = length; + DIP_ASSIGN(oip, size, length); + DIP_ADD(oip, blocks, -blocksreleased); + genfs_node_unlock(ovp); + oip->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(ovp, NULL, NULL, 0); +#if defined(QUOTA) || defined(QUOTA2) + (void) chkdq(oip, -blocksreleased, NOCRED, 0); +#endif + KASSERT(ovp->v_type != VREG || ovp->v_size == oip->i_size); + return (allerror); +} + +/* + * Release blocks associated with the inode ip and stored in the indirect + * block bn. Blocks are free'd in LIFO order up to (but not including) + * lastbn. If level is greater than SINGLE, the block is an indirect block + * and recursive calls to indirtrunc must be used to cleanse other indirect + * blocks. + * + * NB: triple indirect blocks are untested. + */ +static int +ffs_indirtrunc(struct inode *ip, daddr_t lbn, daddr_t dbn, daddr_t lastbn, + int level, int64_t *countp) +{ + int i; + struct buf *bp; + struct fs *fs = ip->i_fs; + int32_t *bap1 = NULL; + int64_t *bap2 = NULL; + struct vnode *vp; + daddr_t nb, nlbn, last; + char *copy = NULL; + int64_t blkcount, factor, blocksreleased = 0; + int nblocks; + int error = 0, allerror = 0; +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif +#define RBAP(ip, i) (((ip)->i_ump->um_fstype == UFS1) ? \ + ufs_rw32(bap1[i], needswap) : ufs_rw64(bap2[i], needswap)) +#define BAP_ASSIGN(ip, i, value) \ + do { \ + if ((ip)->i_ump->um_fstype == UFS1) \ + bap1[i] = (value); \ + else \ + bap2[i] = (value); \ + } while(0) + + /* + * Calculate index in current block of last + * block to be kept. -1 indicates the entire + * block so we need not calculate the index. + */ + factor = 1; + for (i = SINGLE; i < level; i++) + factor *= NINDIR(fs); + last = lastbn; + if (lastbn > 0) + last /= factor; + nblocks = btodb(fs->fs_bsize); + /* + * Get buffer of block pointers, zero those entries corresponding + * to blocks to be free'd, and update on disk copy first. Since + * double(triple) indirect before single(double) indirect, calls + * to bmap on these blocks will fail. However, we already have + * the on disk address, so we have to set the b_blkno field + * explicitly instead of letting bread do everything for us. + */ + vp = ITOV(ip); + error = ffs_getblk(vp, lbn, FFS_NOBLK, fs->fs_bsize, false, &bp); + if (error) { + *countp = 0; + return error; + } + if (bp->b_oflags & (BO_DONE | BO_DELWRI)) { + /* Braces must be here in case trace evaluates to nothing. 
*/ + trace(TR_BREADHIT, pack(vp, fs->fs_bsize), lbn); + } else { + trace(TR_BREADMISS, pack(vp, fs->fs_bsize), lbn); + curlwp->l_ru.ru_inblock++; /* pay for read */ + bp->b_flags |= B_READ; + bp->b_flags &= ~B_COWDONE; /* we change blkno below */ + if (bp->b_bcount > bp->b_bufsize) + panic("ffs_indirtrunc: bad buffer size"); + bp->b_blkno = dbn; + BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); + VOP_STRATEGY(vp, bp); + error = biowait(bp); + if (error == 0) + error = fscow_run(bp, true); + } + if (error) { + brelse(bp, 0); + *countp = 0; + return (error); + } + + if (ip->i_ump->um_fstype == UFS1) + bap1 = (int32_t *)bp->b_data; + else + bap2 = (int64_t *)bp->b_data; + if (lastbn >= 0) { + copy = malloc(fs->fs_bsize, M_TEMP, M_WAITOK); + memcpy((void *)copy, bp->b_data, (u_int)fs->fs_bsize); + for (i = last + 1; i < NINDIR(fs); i++) + BAP_ASSIGN(ip, i, 0); + error = bwrite(bp); + if (error) + allerror = error; + if (ip->i_ump->um_fstype == UFS1) + bap1 = (int32_t *)copy; + else + bap2 = (int64_t *)copy; + } + + /* + * Recursively free totally unused blocks. + */ + for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last; + i--, nlbn += factor) { + nb = RBAP(ip, i); + if (nb == 0) + continue; + if (level > SINGLE) { + error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb), + (daddr_t)-1, level - 1, + &blkcount); + if (error) + allerror = error; + blocksreleased += blkcount; + } + if ((ip->i_ump->um_mountp->mnt_wapbl) && + ((level > SINGLE) || (ITOV(ip)->v_type != VREG))) { + UFS_WAPBL_REGISTER_DEALLOCATION(ip->i_ump->um_mountp, + fsbtodb(fs, nb), fs->fs_bsize); + } else + ffs_blkfree(fs, ip->i_devvp, nb, fs->fs_bsize, + ip->i_number); + blocksreleased += nblocks; + } + + /* + * Recursively free last partial block. + */ + if (level > SINGLE && lastbn >= 0) { + last = lastbn % factor; + nb = RBAP(ip, i); + if (nb != 0) { + error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb), + last, level - 1, &blkcount); + if (error) + allerror = error; + blocksreleased += blkcount; + } + } + + if (copy != NULL) { + free(copy, M_TEMP); + } else { + brelse(bp, BC_INVAL); + } + + *countp = blocksreleased; + return (allerror); +} + +void +ffs_itimes(struct inode *ip, const struct timespec *acc, + const struct timespec *mod, const struct timespec *cre) +{ + struct timespec now; + + if (!(ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY))) { + return; + } + + vfs_timestamp(&now); + if (ip->i_flag & IN_ACCESS) { + if (acc == NULL) + acc = &now; + DIP_ASSIGN(ip, atime, acc->tv_sec); + DIP_ASSIGN(ip, atimensec, acc->tv_nsec); + } + if (ip->i_flag & (IN_UPDATE | IN_MODIFY)) { + if ((ip->i_flags & SF_SNAPSHOT) == 0) { + if (mod == NULL) + mod = &now; + DIP_ASSIGN(ip, mtime, mod->tv_sec); + DIP_ASSIGN(ip, mtimensec, mod->tv_nsec); + } + ip->i_modrev++; + } + if (ip->i_flag & (IN_CHANGE | IN_MODIFY)) { + if (cre == NULL) + cre = &now; + DIP_ASSIGN(ip, ctime, cre->tv_sec); + DIP_ASSIGN(ip, ctimensec, cre->tv_nsec); + } + if (ip->i_flag & (IN_ACCESS | IN_MODIFY)) + ip->i_flag |= IN_ACCESSED; + if (ip->i_flag & (IN_UPDATE | IN_CHANGE)) + ip->i_flag |= IN_MODIFIED; + ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY); +} diff --git a/sys/ufs/ffs/ffs_quota2.c b/sys/ufs/ffs/ffs_quota2.c new file mode 100644 index 000000000..b3d45b38d --- /dev/null +++ b/sys/ufs/ffs/ffs_quota2.c @@ -0,0 +1,118 @@ +/* $NetBSD: ffs_quota2.c,v 1.4 2011/06/12 03:36:00 rmind Exp $ */ +/*- + * Copyright (c) 2010 Manuel Bouyer + * All rights reserved. 
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include
+__KERNEL_RCSID(0, "$NetBSD: ffs_quota2.c,v 1.4 2011/06/12 03:36:00 rmind Exp $");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+
+int
+ffs_quota2_mount(struct mount *mp)
+{
+	struct ufsmount *ump = VFSTOUFS(mp);
+	struct fs *fs = ump->um_fs;
+	int error = 0;
+	struct vnode *vp;
+	struct lwp *l = curlwp;
+
+	if ((fs->fs_flags & FS_DOQUOTA2) == 0)
+		return 0;
+
+	ump->um_flags |= UFS_QUOTA2;
+	ump->umq2_bsize = fs->fs_bsize;
+	ump->umq2_bmask = fs->fs_qbmask;
+	if (fs->fs_quota_magic != Q2_HEAD_MAGIC) {
+		printf("%s: Invalid quota magic number\n",
+		    mp->mnt_stat.f_mntonname);
+		return EINVAL;
+	}
+	if ((fs->fs_quota_flags & FS_Q2_DO_TYPE(USRQUOTA)) &&
+	    fs->fs_quotafile[USRQUOTA] == 0) {
+		printf("%s: no user quota inode\n",
+		    mp->mnt_stat.f_mntonname);
+		error = EINVAL;
+	}
+	if ((fs->fs_quota_flags & FS_Q2_DO_TYPE(GRPQUOTA)) &&
+	    fs->fs_quotafile[GRPQUOTA] == 0) {
+		printf("%s: no group quota inode\n",
+		    mp->mnt_stat.f_mntonname);
+		error = EINVAL;
+	}
+	if (error)
+		return error;
+
+	if (fs->fs_quota_flags & FS_Q2_DO_TYPE(USRQUOTA) &&
+	    ump->um_quotas[USRQUOTA] == NULLVP) {
+		error = VFS_VGET(mp, fs->fs_quotafile[USRQUOTA], &vp);
+		if (error) {
+			printf("%s: can't vget() user quota inode: %d\n",
+			    mp->mnt_stat.f_mntonname, error);
+			return error;
+		}
+		ump->um_quotas[USRQUOTA] = vp;
+		ump->um_cred[USRQUOTA] = l->l_cred;
+		mutex_enter(vp->v_interlock);
+		vp->v_writecount++;
+		mutex_exit(vp->v_interlock);
+		VOP_UNLOCK(vp);
+	}
+	if (fs->fs_quota_flags & FS_Q2_DO_TYPE(GRPQUOTA) &&
+	    ump->um_quotas[GRPQUOTA] == NULLVP) {
+		error = VFS_VGET(mp, fs->fs_quotafile[GRPQUOTA], &vp);
+		if (error) {
+			vn_close(ump->um_quotas[USRQUOTA],
+			    FREAD|FWRITE, l->l_cred);
+			printf("%s: can't vget() group quota inode: %d\n",
+			    mp->mnt_stat.f_mntonname, error);
+			return error;
+		}
+		ump->um_quotas[GRPQUOTA] = vp;
+		ump->um_cred[GRPQUOTA] = l->l_cred;
+		mutex_enter(vp->v_interlock);
+		vp->v_vflag |= VV_SYSTEM;
+		vp->v_writecount++;
+		mutex_exit(vp->v_interlock);
+		VOP_UNLOCK(vp);
+	}
+	mp->mnt_flag |= MNT_QUOTA;
+	return 0;
+}
diff --git a/sys/ufs/ffs/ffs_snapshot.c b/sys/ufs/ffs/ffs_snapshot.c
new
file mode 100644
index 000000000..b1e07c11c
--- /dev/null
+++ b/sys/ufs/ffs/ffs_snapshot.c
@@ -0,0 +1,2331 @@
+/*	$NetBSD: ffs_snapshot.c,v 1.118 2011/10/07 09:35:06 hannken Exp $	*/
+
+/*
+ * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
+ *
+ * Further information about snapshots can be obtained from:
+ *
+ *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
+ *	1614 Oxford Street		mckusick@mckusick.com
+ *	Berkeley, CA 94709-1608		+1-510-843-9542
+ *	USA
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
+ *
+ * from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp
+ */
+
+#include
+__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.118 2011/10/07 09:35:06 hannken Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_ffs.h"
+#include "opt_quota.h"
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+struct snap_info {
+	kmutex_t si_lock;		/* Lock this snapinfo */
+	kmutex_t si_snaplock;		/* Snapshot vnode common lock */
+	lwp_t *si_owner;		/* Snaplock owner */
+	TAILQ_HEAD(inodelst, inode) si_snapshots; /* List of active snapshots */
+	daddr_t *si_snapblklist;	/* Snapshot block hints list */
+	uint32_t si_gen;		/* Incremented on change */
+};
+
+#if !defined(FFS_NO_SNAPSHOT)
+typedef int (*acctfunc_t)
+    (struct vnode *, void *, int, int, struct fs *, daddr_t, int);
+
+static int snapshot_setup(struct mount *, struct vnode *);
+static int snapshot_copyfs(struct mount *, struct vnode *, void **);
+static int snapshot_expunge(struct mount *, struct vnode *,
+    struct fs *, daddr_t *, daddr_t **);
+static int snapshot_expunge_snap(struct mount *, struct vnode *,
+    struct fs *, daddr_t);
+static int snapshot_writefs(struct mount *, struct vnode *, void *);
+static int cgaccount(struct vnode *, int, int *);
+static int cgaccount1(int, struct vnode *, void *, int);
+static int expunge(struct vnode *, struct inode *, struct fs *,
+    acctfunc_t, int);
+static int indiracct(struct vnode *, struct vnode *, int, daddr_t,
+    daddr_t, daddr_t, daddr_t, daddr_t, struct fs *,
acctfunc_t, int); +static int fullacct(struct vnode *, void *, int, int, struct fs *, + daddr_t, int); +static int snapacct(struct vnode *, void *, int, int, struct fs *, + daddr_t, int); +static int mapacct(struct vnode *, void *, int, int, struct fs *, + daddr_t, int); +#endif /* !defined(FFS_NO_SNAPSHOT) */ + +static int ffs_copyonwrite(void *, struct buf *, bool); +static int snapblkaddr(struct vnode *, daddr_t, daddr_t *); +static int rwfsblk(struct vnode *, int, void *, daddr_t); +static int syncsnap(struct vnode *); +static int wrsnapblk(struct vnode *, void *, daddr_t); +#if !defined(FFS_NO_SNAPSHOT) +static int blocks_in_journal(struct fs *); +#endif + +static inline bool is_active_snapshot(struct snap_info *, struct inode *); +static inline daddr_t db_get(struct inode *, int); +static inline void db_assign(struct inode *, int, daddr_t); +static inline daddr_t ib_get(struct inode *, int); +static inline void ib_assign(struct inode *, int, daddr_t); +static inline daddr_t idb_get(struct inode *, void *, int); +static inline void idb_assign(struct inode *, void *, int, daddr_t); + +#ifdef DEBUG +static int snapdebug = 0; +#endif + +int +ffs_snapshot_init(struct ufsmount *ump) +{ + struct snap_info *si; + + si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP); + if (si == NULL) + return ENOMEM; + + TAILQ_INIT(&si->si_snapshots); + mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE); + mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE); + si->si_owner = NULL; + si->si_gen = 0; + si->si_snapblklist = NULL; + + return 0; +} + +void +ffs_snapshot_fini(struct ufsmount *ump) +{ + struct snap_info *si; + + si = ump->um_snapinfo; + ump->um_snapinfo = NULL; + + KASSERT(TAILQ_EMPTY(&si->si_snapshots)); + mutex_destroy(&si->si_lock); + mutex_destroy(&si->si_snaplock); + KASSERT(si->si_snapblklist == NULL); + kmem_free(si, sizeof(*si)); +} + +/* + * Create a snapshot file and initialize it for the filesystem. + * Vnode is locked on entry and return. + */ +int +ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime) +{ +#if defined(FFS_NO_SNAPSHOT) + return EOPNOTSUPP; +} +#else /* defined(FFS_NO_SNAPSHOT) */ + bool suspended = false; + int error, redo = 0, snaploc; + void *sbbuf = NULL; + daddr_t *snaplist = NULL, snaplistsize = 0; + struct buf *bp, *nbp; + struct fs *copy_fs = NULL; + struct fs *fs = VFSTOUFS(mp)->um_fs; + struct inode *ip = VTOI(vp); + struct lwp *l = curlwp; + struct snap_info *si = VFSTOUFS(mp)->um_snapinfo; + struct timespec ts; + struct timeval starttime; +#ifdef DEBUG + struct timeval endtime; +#endif + struct vnode *devvp = ip->i_devvp; + + /* + * If the vnode already is a snapshot, return. + */ + if ((VTOI(vp)->i_flags & SF_SNAPSHOT)) { + if ((VTOI(vp)->i_flags & SF_SNAPINVAL)) + return EINVAL; + if (ctime) { + ctime->tv_sec = DIP(VTOI(vp), mtime); + ctime->tv_nsec = DIP(VTOI(vp), mtimensec); + } + return 0; + } + /* + * Check for free snapshot slot in the superblock. + */ + for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) + if (fs->fs_snapinum[snaploc] == 0) + break; + if (snaploc == FSMAXSNAP) + return (ENOSPC); + /* + * Prepare the vnode to become a snapshot. + */ + error = snapshot_setup(mp, vp); + if (error) + goto out; + + /* + * Copy all the cylinder group maps. Although the + * filesystem is still active, we hope that only a few + * cylinder groups will change between now and when we + * suspend operations. Thus, we will be able to quickly + * touch up the few cylinder groups that changed during + * the suspension period. 
+ */ + error = cgaccount(vp, 1, NULL); + if (error) + goto out; + + /* + * snapshot is now valid + */ + ip->i_flags &= ~SF_SNAPINVAL; + DIP_ASSIGN(ip, flags, ip->i_flags); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + + /* + * Ensure that the snapshot is completely on disk. + * Since we have marked it as a snapshot it is safe to + * unlock it as no process will be allowed to write to it. + */ + error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0); + if (error) + goto out; + VOP_UNLOCK(vp); + /* + * All allocations are done, so we can now suspend the filesystem. + */ + error = vfs_suspend(vp->v_mount, 0); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + if (error) + goto out; + suspended = true; + getmicrotime(&starttime); + /* + * First, copy all the cylinder group maps that have changed. + */ + error = cgaccount(vp, 2, &redo); + if (error) + goto out; + /* + * Create a copy of the superblock and its summary information. + */ + error = snapshot_copyfs(mp, vp, &sbbuf); + copy_fs = (struct fs *)((char *)sbbuf + blkoff(fs, fs->fs_sblockloc)); + if (error) + goto out; + /* + * Expunge unlinked files from our view. + */ + error = snapshot_expunge(mp, vp, copy_fs, &snaplistsize, &snaplist); + if (error) + goto out; + /* + * Record snapshot inode. Since this is the newest snapshot, + * it must be placed at the end of the list. + */ + if (ip->i_nlink > 0) + fs->fs_snapinum[snaploc] = ip->i_number; + + mutex_enter(&si->si_lock); + if (is_active_snapshot(si, ip)) + panic("ffs_snapshot: %"PRIu64" already on list", ip->i_number); + TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap); + if (TAILQ_FIRST(&si->si_snapshots) == ip) { + /* + * If this is the first snapshot on this filesystem, put the + * preliminary list in place and establish the cow handler. + */ + si->si_snapblklist = snaplist; + fscow_establish(mp, ffs_copyonwrite, devvp); + } + si->si_gen++; + mutex_exit(&si->si_lock); + + vp->v_vflag |= VV_SYSTEM; + /* + * Set the mtime to the time the snapshot has been taken. + */ + TIMEVAL_TO_TIMESPEC(&starttime, &ts); + if (ctime) + *ctime = ts; + DIP_ASSIGN(ip, mtime, ts.tv_sec); + DIP_ASSIGN(ip, mtimensec, ts.tv_nsec); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + /* + * Copy allocation information from all snapshots and then + * expunge them from our view. + */ + error = snapshot_expunge_snap(mp, vp, copy_fs, snaplistsize); + if (error) + goto out; + /* + * Write the superblock and its summary information to the snapshot. + */ + error = snapshot_writefs(mp, vp, sbbuf); + if (error) + goto out; + /* + * We're nearly done, ensure that the snapshot is completely on disk. + */ + error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0); + if (error) + goto out; + /* + * Invalidate and free all pages on the snapshot vnode. + * We will read and write through the buffercache. + */ + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, 0, 0, + PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO | PGO_FREE); + if (error) + goto out; + /* + * Invalidate short ( < fs_bsize ) buffers. We will always read + * full size buffers later. 
+ */
+	mutex_enter(&bufcache_lock);
+	KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
+	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
+		nbp = LIST_NEXT(bp, b_vnbufs);
+		KASSERT((bp->b_cflags & BC_BUSY) == 0);
+		if (bp->b_bcount < fs->fs_bsize) {
+			bp->b_cflags |= BC_BUSY;
+			brelsel(bp, BC_INVAL | BC_VFLUSH);
+		}
+	}
+	mutex_exit(&bufcache_lock);
+
+out:
+	if (sbbuf != NULL) {
+		free(copy_fs->fs_csp, M_UFSMNT);
+		free(sbbuf, M_UFSMNT);
+	}
+	if (fs->fs_active != NULL) {
+		free(fs->fs_active, M_DEVBUF);
+		fs->fs_active = NULL;
+	}
+
+	mutex_enter(&si->si_lock);
+	if (snaplist != NULL) {
+		if (si->si_snapblklist == snaplist)
+			si->si_snapblklist = NULL;
+		free(snaplist, M_UFSMNT);
+	}
+	if (error) {
+		fs->fs_snapinum[snaploc] = 0;
+	} else {
+		/*
+		 * As this is the newest list, it is the most inclusive, so
+		 * should replace the previous list.
+		 */
+		si->si_snapblklist = ip->i_snapblklist;
+	}
+	si->si_gen++;
+	mutex_exit(&si->si_lock);
+
+	if (suspended) {
+		VOP_UNLOCK(vp);
+		vfs_resume(vp->v_mount);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+#ifdef DEBUG
+		getmicrotime(&endtime);
+		timersub(&endtime, &starttime, &endtime);
+		printf("%s: suspended %lld.%03d sec, redo %d of %d\n",
+		    mp->mnt_stat.f_mntonname, (long long)endtime.tv_sec,
+		    endtime.tv_usec / 1000, redo, fs->fs_ncg);
+#endif
+	}
+	if (error) {
+		if (!UFS_WAPBL_BEGIN(mp)) {
+			(void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
+			UFS_WAPBL_END(mp);
+		}
+	} else if (ip->i_nlink > 0)
+		vref(vp);
+	return (error);
+}
+
+/*
+ * Prepare vnode to become a snapshot.
+ */
+static int
+snapshot_setup(struct mount *mp, struct vnode *vp)
+{
+	int error, n, len, loc, cg;
+	daddr_t blkno, numblks;
+	struct buf *ibp, *nbp;
+	struct fs *fs = VFSTOUFS(mp)->um_fs;
+	struct lwp *l = curlwp;
+	const int wbreak = blocks_in_journal(fs)/8;
+	struct inode *ip = VTOI(vp);
+
+	/*
+	 * Check mount, exclusive reference and owner.
+	 */
+	if (vp->v_mount != mp)
+		return EXDEV;
+	if (vp->v_usecount != 1 || vp->v_writecount != 0)
+		return EBUSY;
+	if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
+	    NULL) != 0 &&
+	    VTOI(vp)->i_uid != kauth_cred_geteuid(l->l_cred))
+		return EACCES;
+
+	if (vp->v_size != 0) {
+		error = ffs_truncate(vp, 0, 0, NOCRED);
+		if (error)
+			return error;
+	}
+
+	/* Change inode to snapshot type file. */
+	error = UFS_WAPBL_BEGIN(mp);
+	if (error)
+		return error;
+#if defined(QUOTA) || defined(QUOTA2)
+	/* snapshot inodes are not accounted in quotas */
+	chkiq(ip, -1, l->l_cred, 0);
+#endif
+	ip->i_flags |= (SF_SNAPSHOT | SF_SNAPINVAL);
+	DIP_ASSIGN(ip, flags, ip->i_flags);
+	ip->i_flag |= IN_CHANGE | IN_UPDATE;
+	ffs_update(vp, NULL, NULL, UPDATE_WAIT);
+	UFS_WAPBL_END(mp);
+
+	KASSERT(ip->i_flags & SF_SNAPSHOT);
+	/*
+	 * Write an empty list of preallocated blocks to the end of
+	 * the snapshot to set size to at least that of the filesystem.
+	 */
+	numblks = howmany(fs->fs_size, fs->fs_frag);
+	blkno = 1;
+	blkno = ufs_rw64(blkno, UFS_FSNEEDSWAP(fs));
+	error = vn_rdwr(UIO_WRITE, vp,
+	    (void *)&blkno, sizeof(blkno), lblktosize(fs, (off_t)numblks),
+	    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL);
+	if (error)
+		return error;
+	/*
+	 * Preallocate critical data structures so that we can copy
+	 * them in without further allocation after we suspend all
+	 * operations on the filesystem. We would like to just release
+	 * the allocated buffers without writing them since they will
+	 * be filled in below once we are ready to go, but this upsets
+	 * the soft update code, so we go ahead and write the new buffers.
+ * + * Allocate all indirect blocks and mark all of them as not + * needing to be copied. + */ + error = UFS_WAPBL_BEGIN(mp); + if (error) + return error; + for (blkno = NDADDR, n = 0; blkno < numblks; blkno += NINDIR(fs)) { + error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno), + fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); + if (error) + goto out; + brelse(ibp, 0); + if (wbreak > 0 && (++n % wbreak) == 0) { + UFS_WAPBL_END(mp); + error = UFS_WAPBL_BEGIN(mp); + if (error) + return error; + } + } + /* + * Allocate copies for the superblock and its summary information. + */ + error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, l->l_cred, + 0, &nbp); + if (error) + goto out; + bawrite(nbp); + blkno = fragstoblks(fs, fs->fs_csaddr); + len = howmany(fs->fs_cssize, fs->fs_bsize); + for (loc = 0; loc < len; loc++) { + error = ffs_balloc(vp, lblktosize(fs, (off_t)(blkno + loc)), + fs->fs_bsize, l->l_cred, 0, &nbp); + if (error) + goto out; + bawrite(nbp); + if (wbreak > 0 && (++n % wbreak) == 0) { + UFS_WAPBL_END(mp); + error = UFS_WAPBL_BEGIN(mp); + if (error) + return error; + } + } + /* + * Allocate all cylinder group blocks. + */ + for (cg = 0; cg < fs->fs_ncg; cg++) { + error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)), + fs->fs_bsize, l->l_cred, 0, &nbp); + if (error) + goto out; + bawrite(nbp); + if (wbreak > 0 && (++n % wbreak) == 0) { + UFS_WAPBL_END(mp); + error = UFS_WAPBL_BEGIN(mp); + if (error) + return error; + } + } + +out: + UFS_WAPBL_END(mp); + return error; +} + +/* + * Create a copy of the superblock and its summary information. + * It is up to the caller to free copyfs and copy_fs->fs_csp. + */ +static int +snapshot_copyfs(struct mount *mp, struct vnode *vp, void **sbbuf) +{ + int error, i, len, loc, size; + void *space; + int32_t *lp; + struct buf *bp; + struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs; + struct lwp *l = curlwp; + struct vnode *devvp = VTOI(vp)->i_devvp; + + /* + * Grab a copy of the superblock and its summary information. + * We delay writing it until the suspension is released below. + */ + *sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); + loc = blkoff(fs, fs->fs_sblockloc); + if (loc > 0) + memset(*sbbuf, 0, loc); + copyfs = (struct fs *)((char *)(*sbbuf) + loc); + memcpy(copyfs, fs, fs->fs_sbsize); + size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; + if (fs->fs_sbsize < size) + memset((char *)(*sbbuf) + loc + fs->fs_sbsize, 0, + size - fs->fs_sbsize); + size = blkroundup(fs, fs->fs_cssize); + if (fs->fs_contigsumsize > 0) + size += fs->fs_ncg * sizeof(int32_t); + space = malloc(size, M_UFSMNT, M_WAITOK); + copyfs->fs_csp = space; + memcpy(copyfs->fs_csp, fs->fs_csp, fs->fs_cssize); + space = (char *)space + fs->fs_cssize; + loc = howmany(fs->fs_cssize, fs->fs_fsize); + i = fs->fs_frag - loc % fs->fs_frag; + len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize; + if (len > 0) { + if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), + len, l->l_cred, 0, &bp)) != 0) { + brelse(bp, 0); + free(copyfs->fs_csp, M_UFSMNT); + free(*sbbuf, M_UFSMNT); + *sbbuf = NULL; + return error; + } + memcpy(space, bp->b_data, (u_int)len); + space = (char *)space + len; + brelse(bp, BC_INVAL | BC_NOCACHE); + } + if (fs->fs_contigsumsize > 0) { + copyfs->fs_maxcluster = lp = space; + for (i = 0; i < fs->fs_ncg; i++) + *lp++ = fs->fs_contigsumsize; + } + if (mp->mnt_wapbl) + copyfs->fs_flags &= ~FS_DOWAPBL; + return 0; +} + +/* + * We must check for active files that have been unlinked (e.g., with a zero + * link count). 
We have to expunge all trace of these files from the snapshot + * so that they are not reclaimed prematurely by fsck or unnecessarily dumped. + * Note that we skip unlinked snapshot files as they will be handled separately. + * Calculate the snapshot list size and create a preliminary list. + */ +static int +snapshot_expunge(struct mount *mp, struct vnode *vp, struct fs *copy_fs, + daddr_t *snaplistsize, daddr_t **snaplist) +{ + int cg, error = 0, len, loc; + daddr_t blkno, *blkp; + struct fs *fs = VFSTOUFS(mp)->um_fs; + struct inode *xp; + struct lwp *l = curlwp; + struct vattr vat; + struct vnode *logvp = NULL, *mvp = NULL, *xvp; + + *snaplist = NULL; + /* + * Get the log inode if any. + */ + if ((fs->fs_flags & FS_DOWAPBL) && + fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) { + error = VFS_VGET(mp, + fs->fs_journallocs[UFS_WAPBL_INFS_INO], &logvp); + if (error) + goto out; + } + /* + * Allocate a marker vnode. + */ + mvp = vnalloc(mp); + /* + * We also calculate the needed size for the snapshot list. + */ + *snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + + FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; + mutex_enter(&mntvnode_lock); + /* + * NOTE: not using the TAILQ_FOREACH here since in this loop vgone() + * and vclean() can be called indirectly + */ + for (xvp = TAILQ_FIRST(&mp->mnt_vnodelist); xvp; xvp = vunmark(mvp)) { + vmark(mvp, xvp); + /* + * Make sure this vnode wasn't reclaimed in getnewvnode(). + * Start over if it has (it won't be on the list anymore). + */ + if (xvp->v_mount != mp || vismarker(xvp)) + continue; + mutex_enter(xvp->v_interlock); + if ((xvp->v_iflag & VI_XLOCK) || + xvp->v_usecount == 0 || xvp->v_type == VNON || + VTOI(xvp) == NULL || + (VTOI(xvp)->i_flags & SF_SNAPSHOT)) { + mutex_exit(xvp->v_interlock); + continue; + } + mutex_exit(&mntvnode_lock); + /* + * XXXAD should increase vnode ref count to prevent it + * disappearing or being recycled. + */ + mutex_exit(xvp->v_interlock); +#ifdef DEBUG + if (snapdebug) + vprint("ffs_snapshot: busy vnode", xvp); +#endif + xp = VTOI(xvp); + if (xvp != logvp) { + if (VOP_GETATTR(xvp, &vat, l->l_cred) == 0 && + vat.va_nlink > 0) { + mutex_enter(&mntvnode_lock); + continue; + } + if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { + mutex_enter(&mntvnode_lock); + continue; + } + } + /* + * If there is a fragment, clear it here. + */ + blkno = 0; + loc = howmany(xp->i_size, fs->fs_bsize) - 1; + if (loc < NDADDR) { + len = fragroundup(fs, blkoff(fs, xp->i_size)); + if (len > 0 && len < fs->fs_bsize) { + error = UFS_WAPBL_BEGIN(mp); + if (error) { + (void)vunmark(mvp); + goto out; + } + ffs_blkfree_snap(copy_fs, vp, db_get(xp, loc), + len, xp->i_number); + blkno = db_get(xp, loc); + db_assign(xp, loc, 0); + UFS_WAPBL_END(mp); + } + } + *snaplistsize += 1; + error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY); + if (blkno) + db_assign(xp, loc, blkno); + if (!error) { + error = UFS_WAPBL_BEGIN(mp); + if (!error) { + error = ffs_freefile_snap(copy_fs, vp, + xp->i_number, xp->i_mode); + UFS_WAPBL_END(mp); + } + } + if (error) { + (void)vunmark(mvp); + goto out; + } + mutex_enter(&mntvnode_lock); + } + mutex_exit(&mntvnode_lock); + /* + * Create a preliminary list of preallocated snapshot blocks. 
+ */ + *snaplist = malloc(*snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); + blkp = &(*snaplist)[1]; + *blkp++ = lblkno(fs, fs->fs_sblockloc); + blkno = fragstoblks(fs, fs->fs_csaddr); + for (cg = 0; cg < fs->fs_ncg; cg++) { + if (fragstoblks(fs, cgtod(fs, cg)) > blkno) + break; + *blkp++ = fragstoblks(fs, cgtod(fs, cg)); + } + len = howmany(fs->fs_cssize, fs->fs_bsize); + for (loc = 0; loc < len; loc++) + *blkp++ = blkno + loc; + for (; cg < fs->fs_ncg; cg++) + *blkp++ = fragstoblks(fs, cgtod(fs, cg)); + (*snaplist)[0] = blkp - &(*snaplist)[0]; + +out: + if (mvp != NULL) + vnfree(mvp); + if (logvp != NULL) + vput(logvp); + if (error && *snaplist != NULL) { + free(*snaplist, M_UFSMNT); + *snaplist = NULL; + } + + return error; +} + +/* + * Copy allocation information from all the snapshots in this snapshot and + * then expunge them from its view. Also, collect the list of allocated + * blocks in i_snapblklist. + */ +static int +snapshot_expunge_snap(struct mount *mp, struct vnode *vp, + struct fs *copy_fs, daddr_t snaplistsize) +{ + int error = 0, i; + daddr_t numblks, *snaplist = NULL; + struct fs *fs = VFSTOUFS(mp)->um_fs; + struct inode *ip = VTOI(vp), *xp; + struct lwp *l = curlwp; + struct snap_info *si = VFSTOUFS(mp)->um_snapinfo; + + TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) { + if (xp != ip) { + error = expunge(vp, xp, fs, snapacct, BLK_SNAP); + if (error) + break; + } + if (xp->i_nlink != 0) + continue; + error = UFS_WAPBL_BEGIN(mp); + if (error) + break; + error = ffs_freefile_snap(copy_fs, vp, xp->i_number, xp->i_mode); + UFS_WAPBL_END(mp); + if (error) + break; + } + if (error) + goto out; + /* + * Allocate space for the full list of preallocated snapshot blocks. + */ + snaplist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); + ip->i_snapblklist = &snaplist[1]; + /* + * Expunge the blocks used by the snapshots from the set of + * blocks marked as used in the snapshot bitmaps. Also, collect + * the list of allocated blocks in i_snapblklist. + */ + error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP); + if (error) + goto out; + if (snaplistsize < ip->i_snapblklist - snaplist) + panic("ffs_snapshot: list too small"); + snaplistsize = ip->i_snapblklist - snaplist; + snaplist[0] = snaplistsize; + ip->i_snapblklist = &snaplist[0]; + /* + * Write out the list of allocated blocks to the end of the snapshot. + */ + numblks = howmany(fs->fs_size, fs->fs_frag); + for (i = 0; i < snaplistsize; i++) + snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs)); + error = vn_rdwr(UIO_WRITE, vp, (void *)snaplist, + snaplistsize * sizeof(daddr_t), lblktosize(fs, (off_t)numblks), + UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT, l->l_cred, NULL, NULL); + for (i = 0; i < snaplistsize; i++) + snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs)); +out: + if (error && snaplist != NULL) { + free(snaplist, M_UFSMNT); + ip->i_snapblklist = NULL; + } + return error; +} + +/* + * Write the superblock and its summary information to the snapshot. + * Make sure, the first NDADDR blocks get copied to the snapshot. + */ +static int +snapshot_writefs(struct mount *mp, struct vnode *vp, void *sbbuf) +{ + int error, len, loc; + void *space; + daddr_t blkno; + struct buf *bp; + struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs; + struct inode *ip = VTOI(vp); + struct lwp *l = curlwp; + + copyfs = (struct fs *)((char *)sbbuf + blkoff(fs, fs->fs_sblockloc)); + + /* + * Write the superblock and its summary information + * to the snapshot. 
+ */ + blkno = fragstoblks(fs, fs->fs_csaddr); + len = howmany(fs->fs_cssize, fs->fs_bsize); + space = copyfs->fs_csp; +#ifdef FFS_EI + if (UFS_FSNEEDSWAP(fs)) { + ffs_sb_swap(copyfs, copyfs); + ffs_csum_swap(space, space, fs->fs_cssize); + } +#endif + error = UFS_WAPBL_BEGIN(mp); + if (error) + return error; + for (loc = 0; loc < len; loc++) { + error = bread(vp, blkno + loc, fs->fs_bsize, l->l_cred, + B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + break; + } + memcpy(bp->b_data, space, fs->fs_bsize); + space = (char *)space + fs->fs_bsize; + bawrite(bp); + } + if (error) + goto out; + error = bread(vp, lblkno(fs, fs->fs_sblockloc), + fs->fs_bsize, l->l_cred, B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + goto out; + } else { + memcpy(bp->b_data, sbbuf, fs->fs_bsize); + bawrite(bp); + } + /* + * Copy the first NDADDR blocks to the snapshot so ffs_copyonwrite() + * and ffs_snapblkfree() will always work on indirect blocks. + */ + for (loc = 0; loc < NDADDR; loc++) { + if (db_get(ip, loc) != 0) + continue; + error = ffs_balloc(vp, lblktosize(fs, (off_t)loc), + fs->fs_bsize, l->l_cred, 0, &bp); + if (error) + break; + error = rwfsblk(vp, B_READ, bp->b_data, loc); + if (error) { + brelse(bp, 0); + break; + } + bawrite(bp); + } + +out: + UFS_WAPBL_END(mp); + return error; +} + +/* + * Copy all cylinder group maps. + */ +static int +cgaccount(struct vnode *vp, int passno, int *redo) +{ + int cg, error = 0; + struct buf *nbp; + struct fs *fs = VTOI(vp)->i_fs; + + if (redo != NULL) + *redo = 0; + if (passno == 1) + fs->fs_active = malloc(howmany(fs->fs_ncg, NBBY), + M_DEVBUF, M_WAITOK | M_ZERO); + for (cg = 0; cg < fs->fs_ncg; cg++) { + if (passno == 2 && ACTIVECG_ISSET(fs, cg)) + continue; + + if (redo != NULL) + *redo += 1; + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) + return error; + error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)), + fs->fs_bsize, curlwp->l_cred, 0, &nbp); + if (error) { + UFS_WAPBL_END(vp->v_mount); + break; + } + error = cgaccount1(cg, vp, nbp->b_data, passno); + bawrite(nbp); + UFS_WAPBL_END(vp->v_mount); + if (error) + break; + } + return error; +} + +/* + * Copy a cylinder group map. All the unallocated blocks are marked + * BLK_NOCOPY so that the snapshot knows that it need not copy them + * if they are later written. If passno is one, then this is a first + * pass, so only setting needs to be done. If passno is 2, then this + * is a revision to a previous pass which must be undone as the + * replacement pass is done. 
+ */ +static int +cgaccount1(int cg, struct vnode *vp, void *data, int passno) +{ + struct buf *bp, *ibp; + struct inode *ip; + struct cg *cgp; + struct fs *fs; + struct lwp *l = curlwp; + daddr_t base, numblks; + int error, len, loc, ns, indiroff; + + ip = VTOI(vp); + fs = ip->i_fs; + ns = UFS_FSNEEDSWAP(fs); + error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, l->l_cred, 0, &bp); + if (error) { + brelse(bp, 0); + return (error); + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp, ns)) { + brelse(bp, 0); + return (EIO); + } + ACTIVECG_SET(fs, cg); + + memcpy(data, bp->b_data, fs->fs_cgsize); + brelse(bp, 0); + if (fs->fs_cgsize < fs->fs_bsize) + memset((char *)data + fs->fs_cgsize, 0, + fs->fs_bsize - fs->fs_cgsize); + numblks = howmany(fs->fs_size, fs->fs_frag); + len = howmany(fs->fs_fpg, fs->fs_frag); + base = cg * fs->fs_fpg / fs->fs_frag; + if (base + len >= numblks) + len = numblks - base - 1; + loc = 0; + if (base < NDADDR) { + for ( ; loc < NDADDR; loc++) { + if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) + db_assign(ip, loc, BLK_NOCOPY); + else if (db_get(ip, loc) == BLK_NOCOPY) { + if (passno == 2) + db_assign(ip, loc, 0); + else if (passno == 1) + panic("ffs_snapshot: lost direct block"); + } + } + } + if ((error = ffs_balloc(vp, lblktosize(fs, (off_t)(base + loc)), + fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0) + return (error); + indiroff = (base + loc - NDADDR) % NINDIR(fs); + for ( ; loc < len; loc++, indiroff++) { + if (indiroff >= NINDIR(fs)) { + bawrite(ibp); + if ((error = ffs_balloc(vp, + lblktosize(fs, (off_t)(base + loc)), + fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0) + return (error); + indiroff = 0; + } + if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) + idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY); + else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) { + if (passno == 2) + idb_assign(ip, ibp->b_data, indiroff, 0); + else if (passno == 1) + panic("ffs_snapshot: lost indirect block"); + } + } + bdwrite(ibp); + return (0); +} + +/* + * Before expunging a snapshot inode, note all the + * blocks that it claims with BLK_SNAP so that fsck will + * be able to account for those blocks properly and so + * that this snapshot knows that it need not copy them + * if the other snapshot holding them is freed. + */ +static int +expunge(struct vnode *snapvp, struct inode *cancelip, struct fs *fs, + acctfunc_t acctfunc, int expungetype) +{ + int i, error, ns; + daddr_t lbn, rlbn; + daddr_t len, blkno, numblks, blksperindir; + struct ufs1_dinode *dip1; + struct ufs2_dinode *dip2; + struct lwp *l = curlwp; + void *bap; + struct buf *bp; + struct mount *mp; + + ns = UFS_FSNEEDSWAP(fs); + mp = snapvp->v_mount; + + error = UFS_WAPBL_BEGIN(mp); + if (error) + return error; + /* + * Prepare to expunge the inode. If its inode block has not + * yet been copied, then allocate and fill the copy. + */ + lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); + error = snapblkaddr(snapvp, lbn, &blkno); + if (error) + return error; + if (blkno != 0) { + error = bread(snapvp, lbn, fs->fs_bsize, l->l_cred, + B_MODIFY, &bp); + } else { + error = ffs_balloc(snapvp, lblktosize(fs, (off_t)lbn), + fs->fs_bsize, l->l_cred, 0, &bp); + if (! error) + error = rwfsblk(snapvp, B_READ, bp->b_data, lbn); + } + if (error) { + UFS_WAPBL_END(mp); + return error; + } + /* + * Set a snapshot inode to be a zero length file, regular files + * or unlinked snapshots to be completely unallocated. 
+ */ + if (fs->fs_magic == FS_UFS1_MAGIC) { + dip1 = (struct ufs1_dinode *)bp->b_data + + ino_to_fsbo(fs, cancelip->i_number); + if (cancelip->i_flags & SF_SNAPSHOT) { + dip1->di_flags = + ufs_rw32(ufs_rw32(dip1->di_flags, ns) | + SF_SNAPINVAL, ns); + } + if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0) + dip1->di_mode = 0; + dip1->di_size = 0; + dip1->di_blocks = 0; + memset(&dip1->di_db[0], 0, (NDADDR + NIADDR) * sizeof(int32_t)); + } else { + dip2 = (struct ufs2_dinode *)bp->b_data + + ino_to_fsbo(fs, cancelip->i_number); + if (cancelip->i_flags & SF_SNAPSHOT) { + dip2->di_flags = + ufs_rw32(ufs_rw32(dip2->di_flags, ns) | + SF_SNAPINVAL, ns); + } + if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0) + dip2->di_mode = 0; + dip2->di_size = 0; + dip2->di_blocks = 0; + memset(&dip2->di_db[0], 0, (NDADDR + NIADDR) * sizeof(int64_t)); + } + bdwrite(bp); + UFS_WAPBL_END(mp); + /* + * Now go through and expunge all the blocks in the file + * using the function requested. + */ + numblks = howmany(cancelip->i_size, fs->fs_bsize); + if (fs->fs_magic == FS_UFS1_MAGIC) + bap = &cancelip->i_ffs1_db[0]; + else + bap = &cancelip->i_ffs2_db[0]; + error = (*acctfunc)(snapvp, bap, 0, NDADDR, fs, 0, expungetype); + if (error) + return (error); + if (fs->fs_magic == FS_UFS1_MAGIC) + bap = &cancelip->i_ffs1_ib[0]; + else + bap = &cancelip->i_ffs2_ib[0]; + error = (*acctfunc)(snapvp, bap, 0, NIADDR, fs, -1, expungetype); + if (error) + return (error); + blksperindir = 1; + lbn = -NDADDR; + len = numblks - NDADDR; + rlbn = NDADDR; + for (i = 0; len > 0 && i < NIADDR; i++) { + error = indiracct(snapvp, ITOV(cancelip), i, + ib_get(cancelip, i), lbn, rlbn, len, + blksperindir, fs, acctfunc, expungetype); + if (error) + return (error); + blksperindir *= NINDIR(fs); + lbn -= blksperindir + 1; + len -= blksperindir; + rlbn += blksperindir; + } + return (0); +} + +/* + * Descend an indirect block chain for vnode cancelvp accounting for all + * its indirect blocks in snapvp. + */ +static int +indiracct(struct vnode *snapvp, struct vnode *cancelvp, int level, + daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks, + daddr_t blksperindir, struct fs *fs, acctfunc_t acctfunc, int expungetype) +{ + int error, num, i; + daddr_t subblksperindir; + struct indir indirs[NIADDR + 2]; + daddr_t last; + void *bap; + struct buf *bp; + + if (blkno == 0) { + if (expungetype == BLK_NOCOPY) + return (0); + panic("indiracct: missing indir"); + } + if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) + return (error); + if (lbn != indirs[num - 1 - level].in_lbn || num < 2) + panic("indiracct: botched params"); + /* + * We have to expand bread here since it will deadlock looking + * up the block number for any blocks that are not in the cache. + */ + error = ffs_getblk(cancelvp, lbn, fsbtodb(fs, blkno), fs->fs_bsize, + false, &bp); + if (error) + return error; + if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error = + rwfsblk(bp->b_vp, B_READ, bp->b_data, fragstoblks(fs, blkno)))) { + brelse(bp, 0); + return (error); + } + /* + * Account for the block pointers in this indirect block. + */ + last = howmany(remblks, blksperindir); + if (last > NINDIR(fs)) + last = NINDIR(fs); + bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK | M_ZERO); + memcpy((void *)bap, bp->b_data, fs->fs_bsize); + brelse(bp, 0); + error = (*acctfunc)(snapvp, bap, 0, last, + fs, level == 0 ? 
rlbn : -1, expungetype); + if (error || level == 0) + goto out; + /* + * Account for the block pointers in each of the indirect blocks + * in the levels below us. + */ + subblksperindir = blksperindir / NINDIR(fs); + for (lbn++, level--, i = 0; i < last; i++) { + error = indiracct(snapvp, cancelvp, level, + idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks, + subblksperindir, fs, acctfunc, expungetype); + if (error) + goto out; + rlbn += blksperindir; + lbn -= blksperindir; + remblks -= blksperindir; + } +out: + free(bap, M_DEVBUF); + return (error); +} + +/* + * Do both snap accounting and map accounting. + */ +static int +fullacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, + struct fs *fs, daddr_t lblkno, + int exptype /* BLK_SNAP or BLK_NOCOPY */) +{ + int error; + + if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype))) + return (error); + return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)); +} + +/* + * Identify a set of blocks allocated in a snapshot inode. + */ +static int +snapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, + struct fs *fs, daddr_t lblkno, + int expungetype /* BLK_SNAP or BLK_NOCOPY */) +{ + struct inode *ip = VTOI(vp); + struct lwp *l = curlwp; + struct mount *mp = vp->v_mount; + daddr_t blkno; + daddr_t lbn; + struct buf *ibp; + int error, n; + const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8; + + error = UFS_WAPBL_BEGIN(mp); + if (error) + return error; + for ( n = 0; oldblkp < lastblkp; oldblkp++) { + blkno = idb_get(ip, bap, oldblkp); + if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) + continue; + lbn = fragstoblks(fs, blkno); + if (lbn < NDADDR) { + blkno = db_get(ip, lbn); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } else { + error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), + fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); + if (error) + break; + blkno = idb_get(ip, ibp->b_data, + (lbn - NDADDR) % NINDIR(fs)); + } + /* + * If we are expunging a snapshot vnode and we + * find a block marked BLK_NOCOPY, then it is + * one that has been allocated to this snapshot after + * we took our current snapshot and can be ignored. + */ + if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) { + if (lbn >= NDADDR) + brelse(ibp, 0); + } else { + if (blkno != 0) + panic("snapacct: bad block"); + if (lbn < NDADDR) + db_assign(ip, lbn, expungetype); + else { + idb_assign(ip, ibp->b_data, + (lbn - NDADDR) % NINDIR(fs), expungetype); + bdwrite(ibp); + } + } + if (wbreak > 0 && (++n % wbreak) == 0) { + UFS_WAPBL_END(mp); + error = UFS_WAPBL_BEGIN(mp); + if (error) + return error; + } + } + UFS_WAPBL_END(mp); + return error; +} + +/* + * Account for a set of blocks allocated in a snapshot inode. 
+ */ +static int +mapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, + struct fs *fs, daddr_t lblkno, int expungetype) +{ + daddr_t blkno; + struct inode *ip; + struct mount *mp = vp->v_mount; + ino_t inum; + int acctit, error, n; + const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8; + + error = UFS_WAPBL_BEGIN(mp); + if (error) + return error; + ip = VTOI(vp); + inum = ip->i_number; + if (lblkno == -1) + acctit = 0; + else + acctit = 1; + for ( n = 0; oldblkp < lastblkp; oldblkp++, lblkno++) { + blkno = idb_get(ip, bap, oldblkp); + if (blkno == 0 || blkno == BLK_NOCOPY) + continue; + if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) + *ip->i_snapblklist++ = lblkno; + if (blkno == BLK_SNAP) + blkno = blkstofrags(fs, lblkno); + ffs_blkfree_snap(fs, vp, blkno, fs->fs_bsize, inum); + if (wbreak > 0 && (++n % wbreak) == 0) { + UFS_WAPBL_END(mp); + error = UFS_WAPBL_BEGIN(mp); + if (error) + return error; + } + } + UFS_WAPBL_END(mp); + return (0); +} + +/* + * Number of blocks that fit into the journal or zero if not logging. + */ +static int +blocks_in_journal(struct fs *fs) +{ + off_t bpj; + + if ((fs->fs_flags & FS_DOWAPBL) == 0) + return 0; + bpj = 1; + if (fs->fs_journal_version == UFS_WAPBL_VERSION) { + switch (fs->fs_journal_location) { + case UFS_WAPBL_JOURNALLOC_END_PARTITION: + bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]* + fs->fs_journallocs[UFS_WAPBL_EPART_COUNT]; + break; + case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM: + bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]* + fs->fs_journallocs[UFS_WAPBL_INFS_COUNT]; + break; + } + } + bpj /= fs->fs_bsize; + return (bpj > 0 ? bpj : 1); +} +#endif /* defined(FFS_NO_SNAPSHOT) */ + +/* + * Decrement extra reference on snapshot when last name is removed. + * It will not be freed until the last open reference goes away. + */ +void +ffs_snapgone(struct inode *ip) +{ + struct mount *mp = ip->i_devvp->v_specmountpoint; + struct inode *xp; + struct fs *fs; + struct snap_info *si; + int snaploc; + + si = VFSTOUFS(mp)->um_snapinfo; + + /* + * Find snapshot in incore list. + */ + mutex_enter(&si->si_lock); + TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) + if (xp == ip) + break; + mutex_exit(&si->si_lock); + if (xp != NULL) + vrele(ITOV(ip)); +#ifdef DEBUG + else if (snapdebug) + printf("ffs_snapgone: lost snapshot vnode %llu\n", + (unsigned long long)ip->i_number); +#endif + /* + * Delete snapshot inode from superblock. Keep list dense. + */ + mutex_enter(&si->si_lock); + fs = ip->i_fs; + for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) + if (fs->fs_snapinum[snaploc] == ip->i_number) + break; + if (snaploc < FSMAXSNAP) { + for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { + if (fs->fs_snapinum[snaploc] == 0) + break; + fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; + } + fs->fs_snapinum[snaploc - 1] = 0; + } + si->si_gen++; + mutex_exit(&si->si_lock); +} + +/* + * Prepare a snapshot file for being removed. + */ +void +ffs_snapremove(struct vnode *vp) +{ + struct inode *ip = VTOI(vp), *xp; + struct vnode *devvp = ip->i_devvp; + struct fs *fs = ip->i_fs; + struct mount *mp = devvp->v_specmountpoint; + struct buf *ibp; + struct snap_info *si; + struct lwp *l = curlwp; + daddr_t numblks, blkno, dblk; + int error, loc, last; + + si = VFSTOUFS(mp)->um_snapinfo; + /* + * If active, delete from incore list (this snapshot may + * already have been in the process of being deleted, so + * would not have been active). + * + * Clear copy-on-write flag if last snapshot. 
+ */ + mutex_enter(&si->si_snaplock); + mutex_enter(&si->si_lock); + if (is_active_snapshot(si, ip)) { + TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap); + if (TAILQ_FIRST(&si->si_snapshots) != 0) { + /* Roll back the list of preallocated blocks. */ + xp = TAILQ_LAST(&si->si_snapshots, inodelst); + si->si_snapblklist = xp->i_snapblklist; + si->si_gen++; + mutex_exit(&si->si_lock); + mutex_exit(&si->si_snaplock); + } else { + si->si_snapblklist = 0; + si->si_gen++; + mutex_exit(&si->si_lock); + mutex_exit(&si->si_snaplock); + fscow_disestablish(mp, ffs_copyonwrite, devvp); + } + if (ip->i_snapblklist != NULL) { + free(ip->i_snapblklist, M_UFSMNT); + ip->i_snapblklist = NULL; + } + } else { + mutex_exit(&si->si_lock); + mutex_exit(&si->si_snaplock); + } + /* + * Clear all BLK_NOCOPY fields. Pass any block claims to other + * snapshots that want them (see ffs_snapblkfree below). + */ + for (blkno = 1; blkno < NDADDR; blkno++) { + dblk = db_get(ip, blkno); + if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) + db_assign(ip, blkno, 0); + else if ((dblk == blkstofrags(fs, blkno) && + ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize, + ip->i_number))) { + DIP_ADD(ip, blocks, -btodb(fs->fs_bsize)); + db_assign(ip, blkno, 0); + } + } + numblks = howmany(ip->i_size, fs->fs_bsize); + for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { + error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno), + fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); + if (error) + continue; + if (fs->fs_size - blkno > NINDIR(fs)) + last = NINDIR(fs); + else + last = fs->fs_size - blkno; + for (loc = 0; loc < last; loc++) { + dblk = idb_get(ip, ibp->b_data, loc); + if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) + idb_assign(ip, ibp->b_data, loc, 0); + else if (dblk == blkstofrags(fs, blkno) && + ffs_snapblkfree(fs, ip->i_devvp, dblk, + fs->fs_bsize, ip->i_number)) { + DIP_ADD(ip, blocks, -btodb(fs->fs_bsize)); + idb_assign(ip, ibp->b_data, loc, 0); + } + } + bawrite(ibp); + UFS_WAPBL_END(mp); + error = UFS_WAPBL_BEGIN(mp); + KASSERT(error == 0); + } + /* + * Clear snapshot flag and drop reference. + */ + ip->i_flags &= ~(SF_SNAPSHOT | SF_SNAPINVAL); + DIP_ASSIGN(ip, flags, ip->i_flags); + ip->i_flag |= IN_CHANGE | IN_UPDATE; +#if defined(QUOTA) || defined(QUOTA2) + chkdq(ip, DIP(ip, blocks), l->l_cred, FORCE); + chkiq(ip, 1, l->l_cred, FORCE); +#endif +} + +/* + * Notification that a block is being freed. Return zero if the free + * should be allowed to proceed. Return non-zero if the snapshot file + * wants to claim the block. The block will be claimed if it is an + * uncopied part of one of the snapshots. It will be freed if it is + * either a BLK_NOCOPY or has already been copied in all of the snapshots. + * If a fragment is being freed, then all snapshots that care about + * it must make a copy since a snapshot file can only claim full sized + * blocks. Note that if more than one snapshot file maps the block, + * we can pick one at random to claim it. Since none of the snapshots + * can change, we are assurred that they will all see the same unmodified + * image. When deleting a snapshot file (see ffs_snapremove above), we + * must push any of these claimed blocks to one of the other snapshots + * that maps it. These claimed blocks are easily identified as they will + * have a block number equal to their logical block number within the + * snapshot. A copied block can never have this property because they + * must always have been allocated from a BLK_NOCOPY location. 
+ */ +int +ffs_snapblkfree(struct fs *fs, struct vnode *devvp, daddr_t bno, + long size, ino_t inum) +{ + struct mount *mp = devvp->v_specmountpoint; + struct buf *ibp; + struct inode *ip; + struct vnode *vp = NULL; + struct snap_info *si; + void *saved_data = NULL; + daddr_t lbn; + daddr_t blkno; + uint32_t gen; + int indiroff = 0, error = 0, claimedblk = 0; + + si = VFSTOUFS(mp)->um_snapinfo; + lbn = fragstoblks(fs, bno); + mutex_enter(&si->si_snaplock); + mutex_enter(&si->si_lock); + si->si_owner = curlwp; + +retry: + gen = si->si_gen; + TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) { + vp = ITOV(ip); + /* + * Lookup block being written. + */ + if (lbn < NDADDR) { + blkno = db_get(ip, lbn); + } else { + mutex_exit(&si->si_lock); + error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), + fs->fs_bsize, FSCRED, B_METAONLY, &ibp); + if (error) { + mutex_enter(&si->si_lock); + break; + } + indiroff = (lbn - NDADDR) % NINDIR(fs); + blkno = idb_get(ip, ibp->b_data, indiroff); + mutex_enter(&si->si_lock); + if (gen != si->si_gen) { + brelse(ibp, 0); + goto retry; + } + } + /* + * Check to see if block needs to be copied. + */ + if (blkno == 0) { + /* + * A block that we map is being freed. If it has not + * been claimed yet, we will claim or copy it (below). + */ + claimedblk = 1; + } else if (blkno == BLK_SNAP) { + /* + * No previous snapshot claimed the block, + * so it will be freed and become a BLK_NOCOPY + * (don't care) for us. + */ + if (claimedblk) + panic("snapblkfree: inconsistent block type"); + if (lbn < NDADDR) { + db_assign(ip, lbn, BLK_NOCOPY); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } else { + idb_assign(ip, ibp->b_data, indiroff, + BLK_NOCOPY); + mutex_exit(&si->si_lock); + if (ip->i_nlink > 0) + bwrite(ibp); + else + bdwrite(ibp); + mutex_enter(&si->si_lock); + if (gen != si->si_gen) + goto retry; + } + continue; + } else /* BLK_NOCOPY or default */ { + /* + * If the snapshot has already copied the block + * (default), or does not care about the block, + * it is not needed. + */ + if (lbn >= NDADDR) + brelse(ibp, 0); + continue; + } + /* + * If this is a full size block, we will just grab it + * and assign it to the snapshot inode. Otherwise we + * will proceed to copy it. See explanation for this + * routine as to why only a single snapshot needs to + * claim this block. + */ + if (size == fs->fs_bsize) { +#ifdef DEBUG + if (snapdebug) + printf("%s %llu lbn %" PRId64 + "from inum %llu\n", + "Grabonremove: snapino", + (unsigned long long)ip->i_number, + lbn, (unsigned long long)inum); +#endif + mutex_exit(&si->si_lock); + if (lbn < NDADDR) { + db_assign(ip, lbn, bno); + } else { + idb_assign(ip, ibp->b_data, indiroff, bno); + if (ip->i_nlink > 0) + bwrite(ibp); + else + bdwrite(ibp); + } + DIP_ADD(ip, blocks, btodb(size)); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (ip->i_nlink > 0 && mp->mnt_wapbl) + error = syncsnap(vp); + else + error = 0; + mutex_enter(&si->si_lock); + si->si_owner = NULL; + mutex_exit(&si->si_lock); + mutex_exit(&si->si_snaplock); + return (error == 0); + } + if (lbn >= NDADDR) + brelse(ibp, 0); +#ifdef DEBUG + if (snapdebug) + printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n", + "Copyonremove: snapino ", + (unsigned long long)ip->i_number, + lbn, "for inum", (unsigned long long)inum, size); +#endif + /* + * If we have already read the old block contents, then + * simply copy them to the new block. 
Note that we need + * to synchronously write snapshots that have not been + * unlinked, and hence will be visible after a crash, + * to ensure their integrity. + */ + mutex_exit(&si->si_lock); + if (saved_data == NULL) { + saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); + error = rwfsblk(vp, B_READ, saved_data, lbn); + if (error) { + free(saved_data, M_UFSMNT); + saved_data = NULL; + mutex_enter(&si->si_lock); + break; + } + } + error = wrsnapblk(vp, saved_data, lbn); + if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl) + error = syncsnap(vp); + mutex_enter(&si->si_lock); + if (error) + break; + if (gen != si->si_gen) + goto retry; + } + si->si_owner = NULL; + mutex_exit(&si->si_lock); + mutex_exit(&si->si_snaplock); + if (saved_data) + free(saved_data, M_UFSMNT); + /* + * If we have been unable to allocate a block in which to do + * the copy, then return non-zero so that the fragment will + * not be freed. Although space will be lost, the snapshot + * will stay consistent. + */ + return (error); +} + +/* + * Associate snapshot files when mounting. + */ +void +ffs_snapshot_mount(struct mount *mp) +{ + struct vnode *devvp = VFSTOUFS(mp)->um_devvp; + struct fs *fs = VFSTOUFS(mp)->um_fs; + struct lwp *l = curlwp; + struct vnode *vp; + struct inode *ip, *xp; + struct snap_info *si; + daddr_t snaplistsize, *snapblklist; + int i, error, ns, snaploc, loc; + + /* + * No persistent snapshots on apple ufs file systems. + */ + if (UFS_MPISAPPLEUFS(VFSTOUFS(mp))) + return; + + si = VFSTOUFS(mp)->um_snapinfo; + ns = UFS_FSNEEDSWAP(fs); + /* + * XXX The following needs to be set before ffs_truncate or + * VOP_READ can be called. + */ + mp->mnt_stat.f_iosize = fs->fs_bsize; + /* + * Process each snapshot listed in the superblock. + */ + vp = NULL; + mutex_enter(&si->si_lock); + for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { + if (fs->fs_snapinum[snaploc] == 0) + break; + if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], + &vp)) != 0) { + printf("ffs_snapshot_mount: vget failed %d\n", error); + continue; + } + ip = VTOI(vp); + if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) != + SF_SNAPSHOT) { + printf("ffs_snapshot_mount: non-snapshot inode %d\n", + fs->fs_snapinum[snaploc]); + vput(vp); + vp = NULL; + for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { + if (fs->fs_snapinum[loc] == 0) + break; + fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; + } + fs->fs_snapinum[loc - 1] = 0; + snaploc--; + continue; + } + + /* + * Read the block hints list. Use an empty list on + * read errors. + */ + error = vn_rdwr(UIO_READ, vp, + (void *)&snaplistsize, sizeof(snaplistsize), + lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)), + UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS, + l->l_cred, NULL, NULL); + if (error) { + printf("ffs_snapshot_mount: read_1 failed %d\n", error); + snaplistsize = 1; + } else + snaplistsize = ufs_rw64(snaplistsize, ns); + snapblklist = malloc( + snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); + if (error) + snapblklist[0] = 1; + else { + error = vn_rdwr(UIO_READ, vp, (void *)snapblklist, + snaplistsize * sizeof(daddr_t), + lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)), + UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS, + l->l_cred, NULL, NULL); + for (i = 0; i < snaplistsize; i++) + snapblklist[i] = ufs_rw64(snapblklist[i], ns); + if (error) { + printf("ffs_snapshot_mount: read_2 failed %d\n", + error); + snapblklist[0] = 1; + } + } + ip->i_snapblklist = &snapblklist[0]; + + /* + * Link it onto the active snapshot list. 
+ */ + if (is_active_snapshot(si, ip)) + panic("ffs_snapshot_mount: %"PRIu64" already on list", + ip->i_number); + else + TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap); + vp->v_vflag |= VV_SYSTEM; + VOP_UNLOCK(vp); + } + /* + * No usable snapshots found. + */ + if (vp == NULL) { + mutex_exit(&si->si_lock); + return; + } + /* + * Attach the block hints list. We always want to + * use the list from the newest snapshot. + */ + xp = TAILQ_LAST(&si->si_snapshots, inodelst); + si->si_snapblklist = xp->i_snapblklist; + fscow_establish(mp, ffs_copyonwrite, devvp); + si->si_gen++; + mutex_exit(&si->si_lock); +} + +/* + * Disassociate snapshot files when unmounting. + */ +void +ffs_snapshot_unmount(struct mount *mp) +{ + struct vnode *devvp = VFSTOUFS(mp)->um_devvp; + struct inode *xp; + struct vnode *vp = NULL; + struct snap_info *si; + + si = VFSTOUFS(mp)->um_snapinfo; + mutex_enter(&si->si_lock); + while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) { + vp = ITOV(xp); + TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap); + if (xp->i_snapblklist == si->si_snapblklist) + si->si_snapblklist = NULL; + free(xp->i_snapblklist, M_UFSMNT); + if (xp->i_nlink > 0) { + si->si_gen++; + mutex_exit(&si->si_lock); + vrele(vp); + mutex_enter(&si->si_lock); + } + } + si->si_gen++; + mutex_exit(&si->si_lock); + if (vp) + fscow_disestablish(mp, ffs_copyonwrite, devvp); +} + +/* + * Check for need to copy block that is about to be written, + * copying the block if necessary. + */ +static int +ffs_copyonwrite(void *v, struct buf *bp, bool data_valid) +{ + struct fs *fs; + struct inode *ip; + struct vnode *devvp = v, *vp = NULL; + struct mount *mp = devvp->v_specmountpoint; + struct snap_info *si; + void *saved_data = NULL; + daddr_t lbn, blkno, *snapblklist; + uint32_t gen; + int lower, upper, mid, snapshot_locked = 0, error = 0; + + /* + * Check for valid snapshots. + */ + si = VFSTOUFS(mp)->um_snapinfo; + mutex_enter(&si->si_lock); + ip = TAILQ_FIRST(&si->si_snapshots); + if (ip == NULL) { + mutex_exit(&si->si_lock); + return 0; + } + /* + * First check to see if it is after the file system, + * in the journal or in the preallocated list. + * By doing these checks we avoid several potential deadlocks. + */ + fs = ip->i_fs; + lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); + if (bp->b_blkno >= fsbtodb(fs, fs->fs_size)) { + mutex_exit(&si->si_lock); + return 0; + } + if ((fs->fs_flags & FS_DOWAPBL) && + fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) { + off_t blk_off, log_start, log_end; + + log_start = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] * + fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]; + log_end = log_start + fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] * + fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]; + blk_off = dbtob(bp->b_blkno); + if (blk_off >= log_start && blk_off < log_end) { + mutex_exit(&si->si_lock); + return 0; + } + } + snapblklist = si->si_snapblklist; + upper = (snapblklist != NULL ? snapblklist[0] - 1 : 0); + lower = 1; + while (lower <= upper) { + mid = (lower + upper) / 2; + if (snapblklist[mid] == lbn) + break; + if (snapblklist[mid] < lbn) + lower = mid + 1; + else + upper = mid - 1; + } + if (lower <= upper) { + mutex_exit(&si->si_lock); + return 0; + } + /* + * Not in the precomputed list, so check the snapshots. 
+ */ + if (si->si_owner != curlwp) { + if (!mutex_tryenter(&si->si_snaplock)) { + mutex_exit(&si->si_lock); + mutex_enter(&si->si_snaplock); + mutex_enter(&si->si_lock); + } + si->si_owner = curlwp; + snapshot_locked = 1; + } + if (data_valid && bp->b_bcount == fs->fs_bsize) + saved_data = bp->b_data; +retry: + gen = si->si_gen; + TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) { + vp = ITOV(ip); + /* + * We ensure that everything of our own that needs to be + * copied will be done at the time that ffs_snapshot is + * called. Thus we can skip the check here which can + * deadlock in doing the lookup in ffs_balloc. + */ + if (bp->b_vp == vp) + continue; + /* + * Check to see if block needs to be copied. + */ + if (lbn < NDADDR) { + blkno = db_get(ip, lbn); + } else { + mutex_exit(&si->si_lock); + blkno = 0; /* XXX: GCC */ + if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) { + mutex_enter(&si->si_lock); + break; + } + mutex_enter(&si->si_lock); + if (gen != si->si_gen) + goto retry; + } +#ifdef DIAGNOSTIC + if (blkno == BLK_SNAP && bp->b_lblkno >= 0) + panic("ffs_copyonwrite: bad copy block"); +#endif + if (blkno != 0) + continue; + + if (curlwp == uvm.pagedaemon_lwp) { + error = ENOMEM; + break; + } + /* Only one level of recursion allowed. */ + KASSERT(snapshot_locked); + /* + * Allocate the block into which to do the copy. Since + * multiple processes may all try to copy the same block, + * we have to recheck our need to do a copy if we sleep + * waiting for the lock. + * + * Because all snapshots on a filesystem share a single + * lock, we ensure that we will never be in competition + * with another process to allocate a block. + */ +#ifdef DEBUG + if (snapdebug) { + printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ", + (unsigned long long)ip->i_number, lbn); + if (bp->b_vp == devvp) + printf("fs metadata"); + else + printf("inum %llu", (unsigned long long) + VTOI(bp->b_vp)->i_number); + printf(" lblkno %" PRId64 "\n", bp->b_lblkno); + } +#endif + /* + * If we have already read the old block contents, then + * simply copy them to the new block. Note that we need + * to synchronously write snapshots that have not been + * unlinked, and hence will be visible after a crash, + * to ensure their integrity. + */ + mutex_exit(&si->si_lock); + if (saved_data == NULL) { + saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); + error = rwfsblk(vp, B_READ, saved_data, lbn); + if (error) { + free(saved_data, M_UFSMNT); + saved_data = NULL; + mutex_enter(&si->si_lock); + break; + } + } + error = wrsnapblk(vp, saved_data, lbn); + if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl) + error = syncsnap(vp); + mutex_enter(&si->si_lock); + if (error) + break; + if (gen != si->si_gen) + goto retry; + } + /* + * Note that we need to synchronously write snapshots that + * have not been unlinked, and hence will be visible after + * a crash, to ensure their integrity. + */ + if (snapshot_locked) { + si->si_owner = NULL; + mutex_exit(&si->si_lock); + mutex_exit(&si->si_snaplock); + } else + mutex_exit(&si->si_lock); + if (saved_data && saved_data != bp->b_data) + free(saved_data, M_UFSMNT); + return error; +} + +/* + * Read from a snapshot. 
+ */ +int +ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag) +{ + struct inode *ip = VTOI(vp); + struct fs *fs = ip->i_fs; + struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo; + struct buf *bp; + daddr_t lbn, nextlbn; + off_t fsbytes, bytesinfile; + long size, xfersize, blkoffset; + int error; + + fstrans_start(vp->v_mount, FSTRANS_SHARED); + mutex_enter(&si->si_snaplock); + + if (ioflag & IO_ALTSEMANTICS) + fsbytes = ip->i_size; + else + fsbytes = lfragtosize(fs, fs->fs_size); + for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { + bytesinfile = fsbytes - uio->uio_offset; + if (bytesinfile <= 0) + break; + lbn = lblkno(fs, uio->uio_offset); + nextlbn = lbn + 1; + size = fs->fs_bsize; + blkoffset = blkoff(fs, uio->uio_offset); + xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid), + bytesinfile); + + if (lblktosize(fs, nextlbn + 1) >= fsbytes) { + if (lblktosize(fs, lbn) + size > fsbytes) + size = fragroundup(fs, + fsbytes - lblktosize(fs, lbn)); + error = bread(vp, lbn, size, NOCRED, 0, &bp); + } else { + int nextsize = fs->fs_bsize; + error = breadn(vp, lbn, + size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp); + } + if (error) + break; + + /* + * We should only get non-zero b_resid when an I/O error + * has occurred, which should cause us to break above. + * However, if the short read did not cause an error, + * then we want to ensure that we do not uiomove bad + * or uninitialized data. + */ + size -= bp->b_resid; + if (size < blkoffset + xfersize) { + xfersize = size - blkoffset; + if (xfersize <= 0) + break; + } + error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); + if (error) + break; + brelse(bp, BC_AGE); + } + if (bp != NULL) + brelse(bp, BC_AGE); + + mutex_exit(&si->si_snaplock); + fstrans_done(vp->v_mount); + return error; +} + +/* + * Lookup a snapshots data block address. + * Simpler than UFS_BALLOC() as we know all metadata is already allocated + * and safe even for the pagedaemon where we cannot bread(). + */ +static int +snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res) +{ + struct indir indirs[NIADDR + 2]; + struct inode *ip = VTOI(vp); + struct fs *fs = ip->i_fs; + struct buf *bp; + int error, num; + + KASSERT(lbn >= 0); + + if (lbn < NDADDR) { + *res = db_get(ip, lbn); + return 0; + } + if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0) + return error; + if (curlwp == uvm.pagedaemon_lwp) { + mutex_enter(&bufcache_lock); + bp = incore(vp, indirs[num-1].in_lbn); + if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) { + *res = idb_get(ip, bp->b_data, indirs[num-1].in_off); + error = 0; + } else + error = ENOMEM; + mutex_exit(&bufcache_lock); + return error; + } + error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, NOCRED, 0, &bp); + if (error == 0) + *res = idb_get(ip, bp->b_data, indirs[num-1].in_off); + brelse(bp, 0); + + return error; +} + +/* + * Read or write the specified block of the filesystem vp resides on + * from or to the disk bypassing the buffer cache. 
+ */ +static int +rwfsblk(struct vnode *vp, int flags, void *data, daddr_t lbn) +{ + int error; + struct inode *ip = VTOI(vp); + struct fs *fs = ip->i_fs; + struct buf *nbp; + + nbp = getiobuf(NULL, true); + nbp->b_flags = flags; + nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize; + nbp->b_error = 0; + nbp->b_data = data; + nbp->b_blkno = nbp->b_rawblkno = fsbtodb(fs, blkstofrags(fs, lbn)); + nbp->b_proc = NULL; + nbp->b_dev = ip->i_devvp->v_rdev; + SET(nbp->b_cflags, BC_BUSY); /* mark buffer busy */ + + bdev_strategy(nbp); + + error = biowait(nbp); + + putiobuf(nbp); + + return error; +} + +/* + * Write all dirty buffers to disk and invalidate them. + */ +static int +syncsnap(struct vnode *vp) +{ + int error; + buf_t *bp; + struct fs *fs = VTOI(vp)->i_fs; + + mutex_enter(&bufcache_lock); + while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) { + error = bbusy(bp, false, 0, NULL); + if (error == EPASSTHROUGH) + continue; + else if (error != 0) { + mutex_exit(&bufcache_lock); + return error; + } + KASSERT(bp->b_bcount == fs->fs_bsize); + mutex_exit(&bufcache_lock); + error = rwfsblk(vp, B_WRITE, bp->b_data, + fragstoblks(fs, dbtofsb(fs, bp->b_blkno))); + brelse(bp, BC_INVAL | BC_VFLUSH); + if (error) + return error; + mutex_enter(&bufcache_lock); + } + mutex_exit(&bufcache_lock); + + return 0; +} + +/* + * Write the specified block to a snapshot. + */ +static int +wrsnapblk(struct vnode *vp, void *data, daddr_t lbn) +{ + struct inode *ip = VTOI(vp); + struct fs *fs = ip->i_fs; + struct buf *bp; + int error; + + error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, + FSCRED, (ip->i_nlink > 0 ? B_SYNC : 0), &bp); + if (error) + return error; + memcpy(bp->b_data, data, fs->fs_bsize); + if (ip->i_nlink > 0) + error = bwrite(bp); + else + bawrite(bp); + + return error; +} + +/* + * Check if this inode is present on the active snapshot list. + * Must be called with snapinfo locked. + */ +static inline bool +is_active_snapshot(struct snap_info *si, struct inode *ip) +{ + struct inode *xp; + + KASSERT(mutex_owned(&si->si_lock)); + + TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) + if (xp == ip) + return true; + return false; +} + +/* + * Get/Put direct block from inode or buffer containing disk addresses. Take + * care for fs type (UFS1/UFS2) and byte swapping. These functions should go + * into a global include. 
+ */ +static inline daddr_t +db_get(struct inode *ip, int loc) +{ + if (ip->i_ump->um_fstype == UFS1) + return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip)); + else + return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip)); +} + +static inline void +db_assign(struct inode *ip, int loc, daddr_t val) +{ + if (ip->i_ump->um_fstype == UFS1) + ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip)); + else + ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip)); +} + +static inline daddr_t +ib_get(struct inode *ip, int loc) +{ + if (ip->i_ump->um_fstype == UFS1) + return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip)); + else + return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip)); +} + +static inline void +ib_assign(struct inode *ip, int loc, daddr_t val) +{ + if (ip->i_ump->um_fstype == UFS1) + ip->i_ffs1_ib[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip)); + else + ip->i_ffs2_ib[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip)); +} + +static inline daddr_t +idb_get(struct inode *ip, void *bf, int loc) +{ + if (ip->i_ump->um_fstype == UFS1) + return ufs_rw32(((int32_t *)(bf))[loc], UFS_IPNEEDSWAP(ip)); + else + return ufs_rw64(((int64_t *)(bf))[loc], UFS_IPNEEDSWAP(ip)); +} + +static inline void +idb_assign(struct inode *ip, void *bf, int loc, daddr_t val) +{ + if (ip->i_ump->um_fstype == UFS1) + ((int32_t *)(bf))[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip)); + else + ((int64_t *)(bf))[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip)); +} diff --git a/sys/ufs/ffs/ffs_subr.c b/sys/ufs/ffs/ffs_subr.c new file mode 100644 index 000000000..6b6840357 --- /dev/null +++ b/sys/ufs/ffs/ffs_subr.c @@ -0,0 +1,371 @@ +/* $NetBSD: ffs_subr.c,v 1.47 2011/08/14 12:37:09 christos Exp $ */ + +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ffs_subr.c 8.5 (Berkeley) 3/21/95 + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +__KERNEL_RCSID(0, "$NetBSD: ffs_subr.c,v 1.47 2011/08/14 12:37:09 christos Exp $"); + +#include + +/* in ffs_tables.c */ +extern const int inside[], around[]; +extern const u_char * const fragtbl[]; + +#ifndef _KERNEL +#define FFS_EI /* always include byteswapped filesystems support */ +#endif +#include +#include +#include + +#ifndef _KERNEL +#include +void panic(const char *, ...) + __attribute__((__noreturn__,__format__(__printf__,1,2))); + +#else /* _KERNEL */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Load up the contents of an inode and copy the appropriate pieces + * to the incore copy. + */ +void +ffs_load_inode(struct buf *bp, struct inode *ip, struct fs *fs, ino_t ino) +{ + struct ufs1_dinode *dp1; + struct ufs2_dinode *dp2; + + if (ip->i_ump->um_fstype == UFS1) { + dp1 = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ino); +#ifdef FFS_EI + if (UFS_FSNEEDSWAP(fs)) + ffs_dinode1_swap(dp1, ip->i_din.ffs1_din); + else +#endif + *ip->i_din.ffs1_din = *dp1; + + ip->i_mode = ip->i_ffs1_mode; + ip->i_nlink = ip->i_ffs1_nlink; + ip->i_size = ip->i_ffs1_size; + ip->i_flags = ip->i_ffs1_flags; + ip->i_gen = ip->i_ffs1_gen; + ip->i_uid = ip->i_ffs1_uid; + ip->i_gid = ip->i_ffs1_gid; + } else { + dp2 = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ino); +#ifdef FFS_EI + if (UFS_FSNEEDSWAP(fs)) + ffs_dinode2_swap(dp2, ip->i_din.ffs2_din); + else +#endif + *ip->i_din.ffs2_din = *dp2; + + ip->i_mode = ip->i_ffs2_mode; + ip->i_nlink = ip->i_ffs2_nlink; + ip->i_size = ip->i_ffs2_size; + ip->i_flags = ip->i_ffs2_flags; + ip->i_gen = ip->i_ffs2_gen; + ip->i_uid = ip->i_ffs2_uid; + ip->i_gid = ip->i_ffs2_gid; + } +} + +int +ffs_getblk(struct vnode *vp, daddr_t lblkno, daddr_t blkno, int size, + bool clearbuf, buf_t **bpp) +{ + int error = 0; + + KASSERT(blkno >= 0 || blkno == FFS_NOBLK); + + if ((*bpp = getblk(vp, lblkno, size, 0, 0)) == NULL) + return ENOMEM; + if (blkno != FFS_NOBLK) + (*bpp)->b_blkno = blkno; + if (clearbuf) + clrbuf(*bpp); + if ((*bpp)->b_blkno >= 0 && (error = fscow_run(*bpp, false)) != 0) + brelse(*bpp, BC_INVAL); + return error; +} + +#endif /* _KERNEL */ + +/* + * Update the frsum fields to reflect addition or deletion + * of some frags. 
+ */ +void +ffs_fragacct(struct fs *fs, int fragmap, int32_t fraglist[], int cnt, + int needswap) +{ + int inblk; + int field, subfield; + int siz, pos; + + inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1; + fragmap <<= 1; + for (siz = 1; siz < fs->fs_frag; siz++) { + if ((inblk & (1 << (siz + (fs->fs_frag & (NBBY - 1))))) == 0) + continue; + field = around[siz]; + subfield = inside[siz]; + for (pos = siz; pos <= fs->fs_frag; pos++) { + if ((fragmap & field) == subfield) { + fraglist[siz] = ufs_rw32( + ufs_rw32(fraglist[siz], needswap) + cnt, + needswap); + pos += siz; + field <<= siz; + subfield <<= siz; + } + field <<= 1; + subfield <<= 1; + } + } +} + +/* + * block operations + * + * check if a block is available + * returns true if all the correponding bits in the free map are 1 + * returns false if any corresponding bit in the free map is 0 + */ +int +ffs_isblock(struct fs *fs, u_char *cp, int32_t h) +{ + u_char mask; + + switch ((int)fs->fs_fragshift) { + case 3: + return (cp[h] == 0xff); + case 2: + mask = 0x0f << ((h & 0x1) << 2); + return ((cp[h >> 1] & mask) == mask); + case 1: + mask = 0x03 << ((h & 0x3) << 1); + return ((cp[h >> 2] & mask) == mask); + case 0: + mask = 0x01 << (h & 0x7); + return ((cp[h >> 3] & mask) == mask); + default: + panic("ffs_isblock: unknown fs_fragshift %d", + (int)fs->fs_fragshift); + } +} + +/* + * check if a block is completely allocated + * returns true if all the corresponding bits in the free map are 0 + * returns false if any corresponding bit in the free map is 1 + */ +int +ffs_isfreeblock(struct fs *fs, u_char *cp, int32_t h) +{ + + switch ((int)fs->fs_fragshift) { + case 3: + return (cp[h] == 0); + case 2: + return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0); + case 1: + return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0); + case 0: + return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0); + default: + panic("ffs_isfreeblock: unknown fs_fragshift %d", + (int)fs->fs_fragshift); + } +} + +/* + * take a block out of the map + */ +void +ffs_clrblock(struct fs *fs, u_char *cp, int32_t h) +{ + + switch ((int)fs->fs_fragshift) { + case 3: + cp[h] = 0; + return; + case 2: + cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2)); + return; + case 1: + cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1)); + return; + case 0: + cp[h >> 3] &= ~(0x01 << (h & 0x7)); + return; + default: + panic("ffs_clrblock: unknown fs_fragshift %d", + (int)fs->fs_fragshift); + } +} + +/* + * put a block into the map + */ +void +ffs_setblock(struct fs *fs, u_char *cp, int32_t h) +{ + + switch ((int)fs->fs_fragshift) { + case 3: + cp[h] = 0xff; + return; + case 2: + cp[h >> 1] |= (0x0f << ((h & 0x1) << 2)); + return; + case 1: + cp[h >> 2] |= (0x03 << ((h & 0x3) << 1)); + return; + case 0: + cp[h >> 3] |= (0x01 << (h & 0x7)); + return; + default: + panic("ffs_setblock: unknown fs_fragshift %d", + (int)fs->fs_fragshift); + } +} + +/* + * Update the cluster map because of an allocation or free. + * + * Cnt == 1 means free; cnt == -1 means allocating. + */ +void +ffs_clusteracct(struct fs *fs, struct cg *cgp, int32_t blkno, int cnt) +{ + int32_t *sump; + int32_t *lp; + u_char *freemapp, *mapp; + int i, start, end, forw, back, map, bit; +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + + /* KASSERT(mutex_owned(&ump->um_lock)); */ + + if (fs->fs_contigsumsize <= 0) + return; + freemapp = cg_clustersfree(cgp, needswap); + sump = cg_clustersum(cgp, needswap); + /* + * Allocate or clear the actual block. 
+ */ + if (cnt > 0) + setbit(freemapp, blkno); + else + clrbit(freemapp, blkno); + /* + * Find the size of the cluster going forward. + */ + start = blkno + 1; + end = start + fs->fs_contigsumsize; + if ((uint32_t)end >= ufs_rw32(cgp->cg_nclusterblks, needswap)) + end = ufs_rw32(cgp->cg_nclusterblks, needswap); + mapp = &freemapp[start / NBBY]; + map = *mapp++; + bit = 1 << (start % NBBY); + for (i = start; i < end; i++) { + if ((map & bit) == 0) + break; + if ((i & (NBBY - 1)) != (NBBY - 1)) { + bit <<= 1; + } else { + map = *mapp++; + bit = 1; + } + } + forw = i - start; + /* + * Find the size of the cluster going backward. + */ + start = blkno - 1; + end = start - fs->fs_contigsumsize; + if (end < 0) + end = -1; + mapp = &freemapp[start / NBBY]; + map = *mapp--; + bit = 1 << (start % NBBY); + for (i = start; i > end; i--) { + if ((map & bit) == 0) + break; + if ((i & (NBBY - 1)) != 0) { + bit >>= 1; + } else { + map = *mapp--; + bit = 1 << (NBBY - 1); + } + } + back = start - i; + /* + * Account for old cluster and the possibly new forward and + * back clusters. + */ + i = back + forw + 1; + if (i > fs->fs_contigsumsize) + i = fs->fs_contigsumsize; + ufs_add32(sump[i], cnt, needswap); + if (back > 0) + ufs_add32(sump[back], -cnt, needswap); + if (forw > 0) + ufs_add32(sump[forw], -cnt, needswap); + + /* + * Update cluster summary information. + */ + lp = &sump[fs->fs_contigsumsize]; + for (i = fs->fs_contigsumsize; i > 0; i--) + if (ufs_rw32(*lp--, needswap) > 0) + break; +#if defined(_KERNEL) + fs->fs_maxcluster[ufs_rw32(cgp->cg_cgx, needswap)] = i; +#endif +} diff --git a/sys/ufs/ffs/ffs_tables.c b/sys/ufs/ffs/ffs_tables.c new file mode 100644 index 000000000..29f454247 --- /dev/null +++ b/sys/ufs/ffs/ffs_tables.c @@ -0,0 +1,141 @@ +/* $NetBSD: ffs_tables.c,v 1.9 2005/12/11 12:25:25 christos Exp $ */ + +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ffs_tables.c 8.1 (Berkeley) 6/11/93 + */ + +#if HAVE_NBTOOL_CONFIG_H +#include "nbtool_config.h" +#endif + +#include +__KERNEL_RCSID(0, "$NetBSD: ffs_tables.c,v 1.9 2005/12/11 12:25:25 christos Exp $"); + +#include + +/* + * Bit patterns for identifying fragments in the block map + * used as ((map & around) == inside) + */ +const int around[9] = { + 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff +}; +const int inside[9] = { + 0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe +}; + +/* + * Given a block map bit pattern, the frag tables tell whether a + * particular size fragment is available. + * + * used as: + * if ((1 << (size - 1)) & fragtbl[fs->fs_frag][map] { + * at least one fragment of the indicated size is available + * } + * + * These tables are used by the scanc instruction on the VAX to + * quickly find an appropriate fragment. + */ +const u_char fragtbl124[256] = { + 0x00, 0x16, 0x16, 0x2a, 0x16, 0x16, 0x26, 0x4e, + 0x16, 0x16, 0x16, 0x3e, 0x2a, 0x3e, 0x4e, 0x8a, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x2a, 0x3e, 0x3e, 0x2a, 0x3e, 0x3e, 0x2e, 0x6e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x2a, 0x3e, 0x6e, 0xaa, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x26, 0x36, 0x36, 0x2e, 0x36, 0x36, 0x26, 0x6e, + 0x36, 0x36, 0x36, 0x3e, 0x2e, 0x3e, 0x6e, 0xae, + 0x4e, 0x5e, 0x5e, 0x6e, 0x5e, 0x5e, 0x6e, 0x4e, + 0x5e, 0x5e, 0x5e, 0x7e, 0x6e, 0x7e, 0x4e, 0xce, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, 0xbe, + 0x2a, 0x3e, 0x3e, 0x2a, 0x3e, 0x3e, 0x2e, 0x6e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x2a, 0x3e, 0x6e, 0xaa, + 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, 0xbe, + 0x4e, 0x5e, 0x5e, 0x6e, 0x5e, 0x5e, 0x6e, 0x4e, + 0x5e, 0x5e, 0x5e, 0x7e, 0x6e, 0x7e, 0x4e, 0xce, + 0x8a, 0x9e, 0x9e, 0xaa, 0x9e, 0x9e, 0xae, 0xce, + 0x9e, 0x9e, 0x9e, 0xbe, 0xaa, 0xbe, 0xce, 0x8a, +}; + +const u_char fragtbl8[256] = { + 0x00, 0x01, 0x01, 0x02, 0x01, 0x01, 0x02, 0x04, + 0x01, 0x01, 0x01, 0x03, 0x02, 0x03, 0x04, 0x08, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x02, 0x03, 0x03, 0x02, 0x04, 0x05, 0x08, 0x10, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, + 0x04, 0x05, 0x05, 0x06, 0x08, 0x09, 0x10, 0x20, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11, + 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, + 0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0a, + 0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04, + 0x08, 0x09, 0x09, 0x0a, 0x10, 0x11, 0x20, 0x40, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11, + 0x01, 
0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07, + 0x05, 0x05, 0x05, 0x07, 0x09, 0x09, 0x11, 0x21, + 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, + 0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0a, + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07, + 0x02, 0x03, 0x03, 0x02, 0x06, 0x07, 0x0a, 0x12, + 0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04, + 0x05, 0x05, 0x05, 0x07, 0x06, 0x07, 0x04, 0x0c, + 0x08, 0x09, 0x09, 0x0a, 0x09, 0x09, 0x0a, 0x0c, + 0x10, 0x11, 0x11, 0x12, 0x20, 0x21, 0x40, 0x80, +}; + +/* + * The actual fragtbl array. + */ +const u_char * const fragtbl[MAXFRAG + 1] = { + 0, fragtbl124, fragtbl124, 0, fragtbl124, 0, 0, 0, fragtbl8, +}; diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c new file mode 100644 index 000000000..28bbe32dc --- /dev/null +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -0,0 +1,2144 @@ +/* $NetBSD: ffs_vfsops.c,v 1.271 2011/11/14 18:35:14 hannken Exp $ */ + +/*- + * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc, and by Andrew Doran. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 1989, 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_vfsops.c 8.31 (Berkeley) 5/20/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ffs_vfsops.c,v 1.271 2011/11/14 18:35:14 hannken Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_ffs.h" +#include "opt_quota.h" +#include "opt_wapbl.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE(MODULE_CLASS_VFS, ffs, NULL); + +static int ffs_vfs_fsync(vnode_t *, int); + +static struct sysctllog *ffs_sysctl_log; + +/* how many times ffs_init() was called */ +int ffs_initcount = 0; + +extern const struct vnodeopv_desc ffs_vnodeop_opv_desc; +extern const struct vnodeopv_desc ffs_specop_opv_desc; +extern const struct vnodeopv_desc ffs_fifoop_opv_desc; + +const struct vnodeopv_desc * const ffs_vnodeopv_descs[] = { + &ffs_vnodeop_opv_desc, + &ffs_specop_opv_desc, + &ffs_fifoop_opv_desc, + NULL, +}; + +struct vfsops ffs_vfsops = { + MOUNT_FFS, + sizeof (struct ufs_args), + ffs_mount, + ufs_start, + ffs_unmount, + ufs_root, + ufs_quotactl, + ffs_statvfs, + ffs_sync, + ffs_vget, + ffs_fhtovp, + ffs_vptofh, + ffs_init, + ffs_reinit, + ffs_done, + ffs_mountroot, + ffs_snapshot, + ffs_extattrctl, + ffs_suspendctl, + genfs_renamelock_enter, + genfs_renamelock_exit, + ffs_vfs_fsync, + ffs_vnodeopv_descs, + 0, + { NULL, NULL }, +}; + +static const struct genfs_ops ffs_genfsops = { + .gop_size = ffs_gop_size, + .gop_alloc = ufs_gop_alloc, + .gop_write = genfs_gop_write, + .gop_markupdate = ufs_gop_markupdate, +}; + +static const struct ufs_ops ffs_ufsops = { + .uo_itimes = ffs_itimes, + .uo_update = ffs_update, + .uo_truncate = ffs_truncate, + .uo_valloc = ffs_valloc, + .uo_vfree = ffs_vfree, + .uo_balloc = ffs_balloc, + .uo_unmark_vnode = (void (*)(vnode_t *))nullop, +}; + +static int +ffs_modcmd(modcmd_t cmd, void *arg) +{ + int error; + +#if 0 + extern int doasyncfree; +#endif +#ifdef UFS_EXTATTR + extern int ufs_extattr_autocreate; +#endif + extern int ffs_log_changeopt; + + switch (cmd) { + case MODULE_CMD_INIT: + error = vfs_attach(&ffs_vfsops); + if (error != 0) + break; + + sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "vfs", NULL, + NULL, 0, NULL, 0, + CTL_VFS, CTL_EOL); + sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "ffs", + SYSCTL_DESCR("Berkeley Fast File System"), + NULL, 0, NULL, 0, + CTL_VFS, 1, CTL_EOL); + /* + * @@@ should we even bother with these first three? 
+ */ + sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "doclusterread", NULL, + sysctl_notavail, 0, NULL, 0, + CTL_VFS, 1, FFS_CLUSTERREAD, CTL_EOL); + sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "doclusterwrite", NULL, + sysctl_notavail, 0, NULL, 0, + CTL_VFS, 1, FFS_CLUSTERWRITE, CTL_EOL); + sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "doreallocblks", NULL, + sysctl_notavail, 0, NULL, 0, + CTL_VFS, 1, FFS_REALLOCBLKS, CTL_EOL); +#if 0 + sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "doasyncfree", + SYSCTL_DESCR("Release dirty blocks asynchronously"), + NULL, 0, &doasyncfree, 0, + CTL_VFS, 1, FFS_ASYNCFREE, CTL_EOL); +#endif + sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "log_changeopt", + SYSCTL_DESCR("Log changes in optimization strategy"), + NULL, 0, &ffs_log_changeopt, 0, + CTL_VFS, 1, FFS_LOG_CHANGEOPT, CTL_EOL); +#ifdef UFS_EXTATTR + sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "extattr_autocreate", + SYSCTL_DESCR("Size of attribute for " + "backing file autocreation"), + NULL, 0, &ufs_extattr_autocreate, 0, + CTL_VFS, 1, FFS_EXTATTR_AUTOCREATE, CTL_EOL); + +#endif /* UFS_EXTATTR */ + + break; + case MODULE_CMD_FINI: + error = vfs_detach(&ffs_vfsops); + if (error != 0) + break; + sysctl_teardown(&ffs_sysctl_log); + break; + default: + error = ENOTTY; + break; + } + + return (error); +} + +pool_cache_t ffs_inode_cache; +pool_cache_t ffs_dinode1_cache; +pool_cache_t ffs_dinode2_cache; + +static void ffs_oldfscompat_read(struct fs *, struct ufsmount *, daddr_t); +static void ffs_oldfscompat_write(struct fs *, struct ufsmount *); + +/* + * Called by main() when ffs is going to be mounted as root. + */ + +int +ffs_mountroot(void) +{ + struct fs *fs; + struct mount *mp; + struct lwp *l = curlwp; /* XXX */ + struct ufsmount *ump; + int error; + + if (device_class(root_device) != DV_DISK) + return (ENODEV); + + if ((error = vfs_rootmountalloc(MOUNT_FFS, "root_device", &mp))) { + vrele(rootvp); + return (error); + } + + /* + * We always need to be able to mount the root file system. + */ + mp->mnt_flag |= MNT_FORCE; + if ((error = ffs_mountfs(rootvp, mp, l)) != 0) { + vfs_unbusy(mp, false, NULL); + vfs_destroy(mp); + return (error); + } + mp->mnt_flag &= ~MNT_FORCE; + mutex_enter(&mountlist_lock); + CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mutex_exit(&mountlist_lock); + ump = VFSTOUFS(mp); + fs = ump->um_fs; + memset(fs->fs_fsmnt, 0, sizeof(fs->fs_fsmnt)); + (void)copystr(mp->mnt_stat.f_mntonname, fs->fs_fsmnt, MNAMELEN - 1, 0); + (void)ffs_statvfs(mp, &mp->mnt_stat); + vfs_unbusy(mp, false, NULL); + setrootfstime((time_t)fs->fs_time); + return (0); +} + +/* + * VFS Operations. 
+ * + * mount system call + */ +int +ffs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) +{ + struct lwp *l = curlwp; + struct vnode *devvp = NULL; + struct ufs_args *args = data; + struct ufsmount *ump = NULL; + struct fs *fs; + int error = 0, flags, update; + mode_t accessmode; + + if (*data_len < sizeof *args) + return EINVAL; + + if (mp->mnt_flag & MNT_GETARGS) { + ump = VFSTOUFS(mp); + if (ump == NULL) + return EIO; + args->fspec = NULL; + *data_len = sizeof *args; + return 0; + } + + update = mp->mnt_flag & MNT_UPDATE; + + /* Check arguments */ + if (args->fspec != NULL) { + /* + * Look up the name and verify that it's sane. + */ + error = namei_simple_user(args->fspec, + NSM_FOLLOW_NOEMULROOT, &devvp); + if (error != 0) + return (error); + + if (!update) { + /* + * Be sure this is a valid block device + */ + if (devvp->v_type != VBLK) + error = ENOTBLK; + else if (bdevsw_lookup(devvp->v_rdev) == NULL) + error = ENXIO; + } else { + /* + * Be sure we're still naming the same device + * used for our initial mount + */ + ump = VFSTOUFS(mp); + if (devvp != ump->um_devvp) { + if (devvp->v_rdev != ump->um_devvp->v_rdev) + error = EINVAL; + else { + vrele(devvp); + devvp = ump->um_devvp; + vref(devvp); + } + } + } + } else { + if (!update) { + /* New mounts must have a filename for the device */ + return (EINVAL); + } else { + /* Use the extant mount */ + ump = VFSTOUFS(mp); + devvp = ump->um_devvp; + vref(devvp); + } + } + + /* + * If mount by non-root, then verify that user has necessary + * permissions on the device. + * + * Permission to update a mount is checked higher, so here we presume + * updating the mount is okay (for example, as far as securelevel goes) + * which leaves us with the normal check. + */ + if (error == 0) { + accessmode = VREAD; + if (update ? + (mp->mnt_iflag & IMNT_WANTRDWR) != 0 : + (mp->mnt_flag & MNT_RDONLY) == 0) + accessmode |= VWRITE; + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = genfs_can_mount(devvp, accessmode, l->l_cred); + VOP_UNLOCK(devvp); + } + + if (error) { + vrele(devvp); + return (error); + } + +#ifdef WAPBL + /* WAPBL can only be enabled on a r/w mount. */ + if ((mp->mnt_flag & MNT_RDONLY) && !(mp->mnt_iflag & IMNT_WANTRDWR)) { + mp->mnt_flag &= ~MNT_LOG; + } +#else /* !WAPBL */ + mp->mnt_flag &= ~MNT_LOG; +#endif /* !WAPBL */ + + if (!update) { + int xflags; + + if (mp->mnt_flag & MNT_RDONLY) + xflags = FREAD; + else + xflags = FREAD | FWRITE; + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_OPEN(devvp, xflags, FSCRED); + VOP_UNLOCK(devvp); + if (error) + goto fail; + error = ffs_mountfs(devvp, mp, l); + if (error) { + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + (void)VOP_CLOSE(devvp, xflags, NOCRED); + VOP_UNLOCK(devvp); + goto fail; + } + + ump = VFSTOUFS(mp); + fs = ump->um_fs; + } else { + /* + * Update the mount. + */ + + /* + * The initial mount got a reference on this + * device, so drop the one obtained via + * namei(), above. 
+ */ + vrele(devvp); + + ump = VFSTOUFS(mp); + fs = ump->um_fs; + if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) { + /* + * Changing from r/w to r/o + */ + flags = WRITECLOSE; + if (mp->mnt_flag & MNT_FORCE) + flags |= FORCECLOSE; + error = ffs_flushfiles(mp, flags, l); + if (error == 0) + error = UFS_WAPBL_BEGIN(mp); + if (error == 0 && + ffs_cgupdate(ump, MNT_WAIT) == 0 && + fs->fs_clean & FS_WASCLEAN) { + if (mp->mnt_flag & MNT_SOFTDEP) + fs->fs_flags &= ~FS_DOSOFTDEP; + fs->fs_clean = FS_ISCLEAN; + (void) ffs_sbupdate(ump, MNT_WAIT); + } + if (error == 0) + UFS_WAPBL_END(mp); + if (error) + return (error); + } + +#ifdef WAPBL + if ((mp->mnt_flag & MNT_LOG) == 0) { + error = ffs_wapbl_stop(mp, mp->mnt_flag & MNT_FORCE); + if (error) + return error; + } +#endif /* WAPBL */ + + if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) { + /* + * Finish change from r/w to r/o + */ + fs->fs_ronly = 1; + fs->fs_fmod = 0; + } + + if (mp->mnt_flag & MNT_RELOAD) { + error = ffs_reload(mp, l->l_cred, l); + if (error) + return (error); + } + + if (fs->fs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR)) { + /* + * Changing from read-only to read/write + */ +#ifndef QUOTA2 + if (fs->fs_flags & FS_DOQUOTA2) { + ump->um_flags |= UFS_QUOTA2; + uprintf("%s: options QUOTA2 not enabled%s\n", + mp->mnt_stat.f_mntonname, + (mp->mnt_flag & MNT_FORCE) ? "" : + ", not mounting"); + return EINVAL; + } +#endif + fs->fs_ronly = 0; + fs->fs_clean <<= 1; + fs->fs_fmod = 1; +#ifdef WAPBL + if (fs->fs_flags & FS_DOWAPBL) { + printf("%s: replaying log to disk\n", + fs->fs_fsmnt); + KDASSERT(mp->mnt_wapbl_replay); + error = wapbl_replay_write(mp->mnt_wapbl_replay, + devvp); + if (error) { + return error; + } + wapbl_replay_stop(mp->mnt_wapbl_replay); + fs->fs_clean = FS_WASCLEAN; + } +#endif /* WAPBL */ + if (fs->fs_snapinum[0] != 0) + ffs_snapshot_mount(mp); + } + +#ifdef WAPBL + error = ffs_wapbl_start(mp); + if (error) + return error; +#endif /* WAPBL */ + +#ifdef QUOTA2 + if (!fs->fs_ronly) { + error = ffs_quota2_mount(mp); + if (error) { + return error; + } + } +#endif + if (args->fspec == NULL) + return 0; + } + + error = set_statvfs_info(path, UIO_USERSPACE, args->fspec, + UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l); + if (error == 0) + (void)strncpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, + sizeof(fs->fs_fsmnt)); + fs->fs_flags &= ~FS_DOSOFTDEP; + if (fs->fs_fmod != 0) { /* XXX */ + int err; + + fs->fs_fmod = 0; + if (fs->fs_clean & FS_WASCLEAN) + fs->fs_time = time_second; + else { + printf("%s: file system not clean (fs_clean=%#x); " + "please fsck(8)\n", mp->mnt_stat.f_mntfromname, + fs->fs_clean); + printf("%s: lost blocks %" PRId64 " files %d\n", + mp->mnt_stat.f_mntfromname, fs->fs_pendingblocks, + fs->fs_pendinginodes); + } + err = UFS_WAPBL_BEGIN(mp); + if (err == 0) { + (void) ffs_cgupdate(ump, MNT_WAIT); + UFS_WAPBL_END(mp); + } + } + if ((mp->mnt_flag & MNT_SOFTDEP) != 0) { + printf("%s: `-o softdep' is no longer supported, " + "consider `-o log'\n", mp->mnt_stat.f_mntfromname); + mp->mnt_flag &= ~MNT_SOFTDEP; + } + + return (error); + +fail: + vrele(devvp); + return (error); +} + +/* + * Reload all incore data for a filesystem (used after running fsck on + * the root filesystem and finding things to fix). The filesystem must + * be mounted read-only. + * + * Things to do to update the mount: + * 1) invalidate all cached meta-data. + * 2) re-read superblock from disk. + * 3) re-read summary information from disk. + * 4) invalidate all inactive vnodes. + * 5) invalidate all cached file data. 
+ * 6) re-read inode data for all active vnodes. + */ +int +ffs_reload(struct mount *mp, kauth_cred_t cred, struct lwp *l) +{ + struct vnode *vp, *mvp, *devvp; + struct inode *ip; + void *space; + struct buf *bp; + struct fs *fs, *newfs; + struct dkwedge_info dkw; + int i, bsize, blks, error; + int32_t *lp; + struct ufsmount *ump; + daddr_t sblockloc; + + if ((mp->mnt_flag & MNT_RDONLY) == 0) + return (EINVAL); + + ump = VFSTOUFS(mp); + /* + * Step 1: invalidate all cached meta-data. + */ + devvp = ump->um_devvp; + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = vinvalbuf(devvp, 0, cred, l, 0, 0); + VOP_UNLOCK(devvp); + if (error) + panic("ffs_reload: dirty1"); + /* + * Step 2: re-read superblock from disk. + */ + fs = ump->um_fs; + + /* XXX we don't handle possibility that superblock moved. */ + error = bread(devvp, fs->fs_sblockloc / DEV_BSIZE, fs->fs_sbsize, + NOCRED, 0, &bp); + if (error) { + brelse(bp, 0); + return (error); + } + newfs = malloc(fs->fs_sbsize, M_UFSMNT, M_WAITOK); + memcpy(newfs, bp->b_data, fs->fs_sbsize); +#ifdef FFS_EI + if (ump->um_flags & UFS_NEEDSWAP) { + ffs_sb_swap((struct fs*)bp->b_data, newfs); + fs->fs_flags |= FS_SWAPPED; + } else +#endif + fs->fs_flags &= ~FS_SWAPPED; + if ((newfs->fs_magic != FS_UFS1_MAGIC && + newfs->fs_magic != FS_UFS2_MAGIC)|| + newfs->fs_bsize > MAXBSIZE || + newfs->fs_bsize < sizeof(struct fs)) { + brelse(bp, 0); + free(newfs, M_UFSMNT); + return (EIO); /* XXX needs translation */ + } + /* Store off old fs_sblockloc for fs_oldfscompat_read. */ + sblockloc = fs->fs_sblockloc; + /* + * Copy pointer fields back into superblock before copying in XXX + * new superblock. These should really be in the ufsmount. XXX + * Note that important parameters (eg fs_ncg) are unchanged. + */ + newfs->fs_csp = fs->fs_csp; + newfs->fs_maxcluster = fs->fs_maxcluster; + newfs->fs_contigdirs = fs->fs_contigdirs; + newfs->fs_ronly = fs->fs_ronly; + newfs->fs_active = fs->fs_active; + memcpy(fs, newfs, (u_int)fs->fs_sbsize); + brelse(bp, 0); + free(newfs, M_UFSMNT); + + /* Recheck for apple UFS filesystem */ + ump->um_flags &= ~UFS_ISAPPLEUFS; + /* First check to see if this is tagged as an Apple UFS filesystem + * in the disklabel + */ + if (getdiskinfo(devvp, &dkw) == 0 && + strcmp(dkw.dkw_ptype, DKW_PTYPE_APPLEUFS) == 0) + ump->um_flags |= UFS_ISAPPLEUFS; +#ifdef APPLE_UFS + else { + /* Manually look for an apple ufs label, and if a valid one + * is found, then treat it like an Apple UFS filesystem anyway + * + * EINVAL is most probably a blocksize or alignment problem, + * it is unlikely that this is an Apple UFS filesystem then. 
+ */ + error = bread(devvp, (daddr_t)(APPLEUFS_LABEL_OFFSET / DEV_BSIZE), + APPLEUFS_LABEL_SIZE, cred, 0, &bp); + if (error && error != EINVAL) { + brelse(bp, 0); + return (error); + } + if (error == 0) { + error = ffs_appleufs_validate(fs->fs_fsmnt, + (struct appleufslabel *)bp->b_data, NULL); + if (error == 0) + ump->um_flags |= UFS_ISAPPLEUFS; + } + brelse(bp, 0); + bp = NULL; + } +#else + if (ump->um_flags & UFS_ISAPPLEUFS) + return (EIO); +#endif + + if (UFS_MPISAPPLEUFS(ump)) { + /* see comment about NeXT below */ + ump->um_maxsymlinklen = APPLEUFS_MAXSYMLINKLEN; + ump->um_dirblksiz = APPLEUFS_DIRBLKSIZ; + mp->mnt_iflag |= IMNT_DTYPE; + } else { + ump->um_maxsymlinklen = fs->fs_maxsymlinklen; + ump->um_dirblksiz = DIRBLKSIZ; + if (ump->um_maxsymlinklen > 0) + mp->mnt_iflag |= IMNT_DTYPE; + else + mp->mnt_iflag &= ~IMNT_DTYPE; + } + ffs_oldfscompat_read(fs, ump, sblockloc); + + mutex_enter(&ump->um_lock); + ump->um_maxfilesize = fs->fs_maxfilesize; + if (fs->fs_flags & ~(FS_KNOWN_FLAGS | FS_INTERNAL)) { + uprintf("%s: unknown ufs flags: 0x%08"PRIx32"%s\n", + mp->mnt_stat.f_mntonname, fs->fs_flags, + (mp->mnt_flag & MNT_FORCE) ? "" : ", not mounting"); + if ((mp->mnt_flag & MNT_FORCE) == 0) { + mutex_exit(&ump->um_lock); + return (EINVAL); + } + } + if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { + fs->fs_pendingblocks = 0; + fs->fs_pendinginodes = 0; + } + mutex_exit(&ump->um_lock); + + ffs_statvfs(mp, &mp->mnt_stat); + /* + * Step 3: re-read summary information from disk. + */ + blks = howmany(fs->fs_cssize, fs->fs_fsize); + space = fs->fs_csp; + for (i = 0; i < blks; i += fs->fs_frag) { + bsize = fs->fs_bsize; + if (i + fs->fs_frag > blks) + bsize = (blks - i) * fs->fs_fsize; + error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), bsize, + NOCRED, 0, &bp); + if (error) { + brelse(bp, 0); + return (error); + } +#ifdef FFS_EI + if (UFS_FSNEEDSWAP(fs)) + ffs_csum_swap((struct csum *)bp->b_data, + (struct csum *)space, bsize); + else +#endif + memcpy(space, bp->b_data, (size_t)bsize); + space = (char *)space + bsize; + brelse(bp, 0); + } + if (fs->fs_snapinum[0] != 0) + ffs_snapshot_mount(mp); + /* + * We no longer know anything about clusters per cylinder group. + */ + if (fs->fs_contigsumsize > 0) { + lp = fs->fs_maxcluster; + for (i = 0; i < fs->fs_ncg; i++) + *lp++ = fs->fs_contigsumsize; + } + + /* Allocate a marker vnode. */ + mvp = vnalloc(mp); + /* + * NOTE: not using the TAILQ_FOREACH here since in this loop vgone() + * and vclean() can be called indirectly + */ + mutex_enter(&mntvnode_lock); + loop: + for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) { + vmark(mvp, vp); + if (vp->v_mount != mp || vismarker(vp)) + continue; + /* + * Step 4: invalidate all inactive vnodes. + */ + if (vrecycle(vp, &mntvnode_lock, l)) { + mutex_enter(&mntvnode_lock); + (void)vunmark(mvp); + goto loop; + } + /* + * Step 5: invalidate all cached file data. + */ + mutex_enter(vp->v_interlock); + mutex_exit(&mntvnode_lock); + if (vget(vp, LK_EXCLUSIVE)) { + (void)vunmark(mvp); + goto loop; + } + if (vinvalbuf(vp, 0, cred, l, 0, 0)) + panic("ffs_reload: dirty2"); + /* + * Step 6: re-read inode data for all active vnodes. 
+ */ + ip = VTOI(vp); + error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), + (int)fs->fs_bsize, NOCRED, 0, &bp); + if (error) { + brelse(bp, 0); + vput(vp); + (void)vunmark(mvp); + break; + } + ffs_load_inode(bp, ip, fs, ip->i_number); + brelse(bp, 0); + vput(vp); + mutex_enter(&mntvnode_lock); + } + mutex_exit(&mntvnode_lock); + vnfree(mvp); + return (error); +} + +/* + * Possible superblock locations ordered from most to least likely. + */ +static const int sblock_try[] = SBLOCKSEARCH; + +/* + * Common code for mount and mountroot + */ +int +ffs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l) +{ + struct ufsmount *ump; + struct buf *bp; + struct fs *fs; + dev_t dev; + struct dkwedge_info dkw; + void *space; + daddr_t sblockloc, fsblockloc; + int blks, fstype; + int error, i, bsize, ronly, bset = 0; +#ifdef FFS_EI + int needswap = 0; /* keep gcc happy */ +#endif + int32_t *lp; + kauth_cred_t cred; + u_int32_t sbsize = 8192; /* keep gcc happy*/ + int32_t fsbsize; + + dev = devvp->v_rdev; + cred = l ? l->l_cred : NOCRED; + + /* Flush out any old buffers remaining from a previous use. */ + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = vinvalbuf(devvp, V_SAVE, cred, l, 0, 0); + VOP_UNLOCK(devvp); + if (error) + return (error); + + ronly = (mp->mnt_flag & MNT_RDONLY) != 0; + + bp = NULL; + ump = NULL; + fs = NULL; + sblockloc = 0; + fstype = 0; + + error = fstrans_mount(mp); + if (error) + return error; + + ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK); + memset(ump, 0, sizeof *ump); + mutex_init(&ump->um_lock, MUTEX_DEFAULT, IPL_NONE); + error = ffs_snapshot_init(ump); + if (error) + goto out; + ump->um_ops = &ffs_ufsops; + +#ifdef WAPBL + sbagain: +#endif + /* + * Try reading the superblock in each of its possible locations. + */ + for (i = 0; ; i++) { + if (bp != NULL) { + brelse(bp, BC_NOCACHE); + bp = NULL; + } + if (sblock_try[i] == -1) { + error = EINVAL; + fs = NULL; + goto out; + } + error = bread(devvp, sblock_try[i] / DEV_BSIZE, SBLOCKSIZE, cred, + 0, &bp); + if (error) { + fs = NULL; + goto out; + } + fs = (struct fs*)bp->b_data; + fsblockloc = sblockloc = sblock_try[i]; + if (fs->fs_magic == FS_UFS1_MAGIC) { + sbsize = fs->fs_sbsize; + fstype = UFS1; + fsbsize = fs->fs_bsize; +#ifdef FFS_EI + needswap = 0; + } else if (fs->fs_magic == bswap32(FS_UFS1_MAGIC)) { + sbsize = bswap32(fs->fs_sbsize); + fstype = UFS1; + fsbsize = bswap32(fs->fs_bsize); + needswap = 1; +#endif + } else if (fs->fs_magic == FS_UFS2_MAGIC) { + sbsize = fs->fs_sbsize; + fstype = UFS2; + fsbsize = fs->fs_bsize; +#ifdef FFS_EI + needswap = 0; + } else if (fs->fs_magic == bswap32(FS_UFS2_MAGIC)) { + sbsize = bswap32(fs->fs_sbsize); + fstype = UFS2; + fsbsize = bswap32(fs->fs_bsize); + needswap = 1; +#endif + } else + continue; + + + /* fs->fs_sblockloc isn't defined for old filesystems */ + if (fstype == UFS1 && !(fs->fs_old_flags & FS_FLAGS_UPDATED)) { + if (sblockloc == SBLOCK_UFS2) + /* + * This is likely to be the first alternate + * in a filesystem with 64k blocks. + * Don't use it. 
+ */ + continue; + fsblockloc = sblockloc; + } else { + fsblockloc = fs->fs_sblockloc; +#ifdef FFS_EI + if (needswap) + fsblockloc = bswap64(fsblockloc); +#endif + } + + /* Check we haven't found an alternate superblock */ + if (fsblockloc != sblockloc) + continue; + + /* Validate size of superblock */ + if (sbsize > MAXBSIZE || sbsize < sizeof(struct fs)) + continue; + + /* Check that we can handle the file system blocksize */ + if (fsbsize > MAXBSIZE) { + printf("ffs_mountfs: block size (%d) > MAXBSIZE (%d)\n", + fsbsize, MAXBSIZE); + continue; + } + + /* Ok seems to be a good superblock */ + break; + } + + fs = malloc((u_long)sbsize, M_UFSMNT, M_WAITOK); + memcpy(fs, bp->b_data, sbsize); + ump->um_fs = fs; + +#ifdef FFS_EI + if (needswap) { + ffs_sb_swap((struct fs*)bp->b_data, fs); + fs->fs_flags |= FS_SWAPPED; + } else +#endif + fs->fs_flags &= ~FS_SWAPPED; + +#ifdef WAPBL + if ((mp->mnt_wapbl_replay == 0) && (fs->fs_flags & FS_DOWAPBL)) { + error = ffs_wapbl_replay_start(mp, fs, devvp); + if (error && (mp->mnt_flag & MNT_FORCE) == 0) + goto out; + if (!error) { + if (!ronly) { + /* XXX fsmnt may be stale. */ + printf("%s: replaying log to disk\n", + fs->fs_fsmnt); + error = wapbl_replay_write(mp->mnt_wapbl_replay, + devvp); + if (error) + goto out; + wapbl_replay_stop(mp->mnt_wapbl_replay); + fs->fs_clean = FS_WASCLEAN; + } else { + /* XXX fsmnt may be stale */ + printf("%s: replaying log to memory\n", + fs->fs_fsmnt); + } + + /* Force a re-read of the superblock */ + brelse(bp, BC_INVAL); + bp = NULL; + free(fs, M_UFSMNT); + fs = NULL; + goto sbagain; + } + } +#else /* !WAPBL */ + if ((fs->fs_flags & FS_DOWAPBL) && (mp->mnt_flag & MNT_FORCE) == 0) { + error = EPERM; + goto out; + } +#endif /* !WAPBL */ + + ffs_oldfscompat_read(fs, ump, sblockloc); + ump->um_maxfilesize = fs->fs_maxfilesize; + + if (fs->fs_flags & ~(FS_KNOWN_FLAGS | FS_INTERNAL)) { + uprintf("%s: unknown ufs flags: 0x%08"PRIx32"%s\n", + mp->mnt_stat.f_mntonname, fs->fs_flags, + (mp->mnt_flag & MNT_FORCE) ? "" : ", not mounting"); + if ((mp->mnt_flag & MNT_FORCE) == 0) { + error = EINVAL; + goto out; + } + } + + if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { + fs->fs_pendingblocks = 0; + fs->fs_pendinginodes = 0; + } + + ump->um_fstype = fstype; + if (fs->fs_sbsize < SBLOCKSIZE) + brelse(bp, BC_INVAL); + else + brelse(bp, 0); + bp = NULL; + + /* First check to see if this is tagged as an Apple UFS filesystem + * in the disklabel + */ + if (getdiskinfo(devvp, &dkw) == 0 && + strcmp(dkw.dkw_ptype, DKW_PTYPE_APPLEUFS) == 0) + ump->um_flags |= UFS_ISAPPLEUFS; +#ifdef APPLE_UFS + else { + /* Manually look for an apple ufs label, and if a valid one + * is found, then treat it like an Apple UFS filesystem anyway + */ + error = bread(devvp, (daddr_t)(APPLEUFS_LABEL_OFFSET / DEV_BSIZE), + APPLEUFS_LABEL_SIZE, cred, 0, &bp); + if (error) + goto out; + error = ffs_appleufs_validate(fs->fs_fsmnt, + (struct appleufslabel *)bp->b_data, NULL); + if (error == 0) { + ump->um_flags |= UFS_ISAPPLEUFS; + } + brelse(bp, 0); + bp = NULL; + } +#else + if (ump->um_flags & UFS_ISAPPLEUFS) { + error = EINVAL; + goto out; + } +#endif + +#if 0 +/* + * XXX This code changes the behaviour of mounting dirty filesystems, to + * XXX require "mount -f ..." to mount them. This doesn't match what + * XXX mount(8) describes and is disabled for now. + */ + /* + * If the file system is not clean, don't allow it to be mounted + * unless MNT_FORCE is specified. (Note: MNT_FORCE is always set + * for the root file system.) 
+ */ + if (fs->fs_flags & FS_DOWAPBL) { + /* + * wapbl normally expects to be FS_WASCLEAN when the FS_DOWAPBL + * bit is set, although there's a window in unmount where it + * could be FS_ISCLEAN + */ + if ((mp->mnt_flag & MNT_FORCE) == 0 && + (fs->fs_clean & (FS_WASCLEAN | FS_ISCLEAN)) == 0) { + error = EPERM; + goto out; + } + } else + if ((fs->fs_clean & FS_ISCLEAN) == 0 && + (mp->mnt_flag & MNT_FORCE) == 0) { + error = EPERM; + goto out; + } +#endif + + /* + * verify that we can access the last block in the fs + * if we're mounting read/write. + */ + + if (!ronly) { + error = bread(devvp, fsbtodb(fs, fs->fs_size - 1), fs->fs_fsize, + cred, 0, &bp); + if (bp->b_bcount != fs->fs_fsize) + error = EINVAL; + if (error) { + bset = BC_INVAL; + goto out; + } + brelse(bp, BC_INVAL); + bp = NULL; + } + + fs->fs_ronly = ronly; + /* Don't bump fs_clean if we're replaying journal */ + if (!((fs->fs_flags & FS_DOWAPBL) && (fs->fs_clean & FS_WASCLEAN))) + if (ronly == 0) { + fs->fs_clean <<= 1; + fs->fs_fmod = 1; + } + bsize = fs->fs_cssize; + blks = howmany(bsize, fs->fs_fsize); + if (fs->fs_contigsumsize > 0) + bsize += fs->fs_ncg * sizeof(int32_t); + bsize += fs->fs_ncg * sizeof(*fs->fs_contigdirs); + space = malloc((u_long)bsize, M_UFSMNT, M_WAITOK); + fs->fs_csp = space; + for (i = 0; i < blks; i += fs->fs_frag) { + bsize = fs->fs_bsize; + if (i + fs->fs_frag > blks) + bsize = (blks - i) * fs->fs_fsize; + error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), bsize, + cred, 0, &bp); + if (error) { + free(fs->fs_csp, M_UFSMNT); + goto out; + } +#ifdef FFS_EI + if (needswap) + ffs_csum_swap((struct csum *)bp->b_data, + (struct csum *)space, bsize); + else +#endif + memcpy(space, bp->b_data, (u_int)bsize); + + space = (char *)space + bsize; + brelse(bp, 0); + bp = NULL; + } + if (fs->fs_contigsumsize > 0) { + fs->fs_maxcluster = lp = space; + for (i = 0; i < fs->fs_ncg; i++) + *lp++ = fs->fs_contigsumsize; + space = lp; + } + bsize = fs->fs_ncg * sizeof(*fs->fs_contigdirs); + fs->fs_contigdirs = space; + space = (char *)space + bsize; + memset(fs->fs_contigdirs, 0, bsize); + /* Compatibility for old filesystems - XXX */ + if (fs->fs_avgfilesize <= 0) + fs->fs_avgfilesize = AVFILESIZ; + if (fs->fs_avgfpdir <= 0) + fs->fs_avgfpdir = AFPDIR; + fs->fs_active = NULL; + mp->mnt_data = ump; + mp->mnt_stat.f_fsidx.__fsid_val[0] = (long)dev; + mp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_FFS); + mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; + mp->mnt_stat.f_namemax = FFS_MAXNAMLEN; + if (UFS_MPISAPPLEUFS(ump)) { + /* NeXT used to keep short symlinks in the inode even + * when using FS_42INODEFMT. In that case fs->fs_maxsymlinklen + * is probably -1, but we still need to be able to identify + * short symlinks. 
+ */ + ump->um_maxsymlinklen = APPLEUFS_MAXSYMLINKLEN; + ump->um_dirblksiz = APPLEUFS_DIRBLKSIZ; + mp->mnt_iflag |= IMNT_DTYPE; + } else { + ump->um_maxsymlinklen = fs->fs_maxsymlinklen; + ump->um_dirblksiz = DIRBLKSIZ; + if (ump->um_maxsymlinklen > 0) + mp->mnt_iflag |= IMNT_DTYPE; + else + mp->mnt_iflag &= ~IMNT_DTYPE; + } + mp->mnt_fs_bshift = fs->fs_bshift; + mp->mnt_dev_bshift = DEV_BSHIFT; /* XXX */ + mp->mnt_flag |= MNT_LOCAL; + mp->mnt_iflag |= IMNT_MPSAFE; +#ifdef FFS_EI + if (needswap) + ump->um_flags |= UFS_NEEDSWAP; +#endif + ump->um_mountp = mp; + ump->um_dev = dev; + ump->um_devvp = devvp; + ump->um_nindir = fs->fs_nindir; + ump->um_lognindir = ffs(fs->fs_nindir) - 1; + ump->um_bptrtodb = fs->fs_fshift - DEV_BSHIFT; + ump->um_seqinc = fs->fs_frag; + for (i = 0; i < MAXQUOTAS; i++) + ump->um_quotas[i] = NULLVP; + devvp->v_specmountpoint = mp; + if (ronly == 0 && fs->fs_snapinum[0] != 0) + ffs_snapshot_mount(mp); +#ifdef WAPBL + if (!ronly) { + KDASSERT(fs->fs_ronly == 0); + /* + * ffs_wapbl_start() needs mp->mnt_stat initialised if it + * needs to create a new log file in-filesystem. + */ + ffs_statvfs(mp, &mp->mnt_stat); + + error = ffs_wapbl_start(mp); + if (error) { + free(fs->fs_csp, M_UFSMNT); + goto out; + } + } +#endif /* WAPBL */ + if (ronly == 0) { +#ifdef QUOTA2 + error = ffs_quota2_mount(mp); + if (error) { + free(fs->fs_csp, M_UFSMNT); + goto out; + } +#else + if (fs->fs_flags & FS_DOQUOTA2) { + ump->um_flags |= UFS_QUOTA2; + uprintf("%s: options QUOTA2 not enabled%s\n", + mp->mnt_stat.f_mntonname, + (mp->mnt_flag & MNT_FORCE) ? "" : ", not mounting"); + if ((mp->mnt_flag & MNT_FORCE) == 0) { + error = EINVAL; + free(fs->fs_csp, M_UFSMNT); + goto out; + } + } +#endif + } +#ifdef UFS_EXTATTR + /* + * Initialize file-backed extended attributes on UFS1 file + * systems. + */ + if (ump->um_fstype == UFS1) + ufs_extattr_uepm_init(&ump->um_extattr); +#endif /* UFS_EXTATTR */ + + return (0); +out: +#ifdef WAPBL + if (mp->mnt_wapbl_replay) { + wapbl_replay_stop(mp->mnt_wapbl_replay); + wapbl_replay_free(mp->mnt_wapbl_replay); + mp->mnt_wapbl_replay = 0; + } +#endif + + fstrans_unmount(mp); + if (fs) + free(fs, M_UFSMNT); + devvp->v_specmountpoint = NULL; + if (bp) + brelse(bp, bset); + if (ump) { + if (ump->um_oldfscompat) + free(ump->um_oldfscompat, M_UFSMNT); + mutex_destroy(&ump->um_lock); + free(ump, M_UFSMNT); + mp->mnt_data = NULL; + } + return (error); +} + +/* + * Sanity checks for loading old filesystem superblocks. + * See ffs_oldfscompat_write below for unwound actions. + * + * XXX - Parts get retired eventually. + * Unfortunately new bits get added. + */ +static void +ffs_oldfscompat_read(struct fs *fs, struct ufsmount *ump, daddr_t sblockloc) +{ + off_t maxfilesize; + int32_t *extrasave; + + if ((fs->fs_magic != FS_UFS1_MAGIC) || + (fs->fs_old_flags & FS_FLAGS_UPDATED)) + return; + + if (!ump->um_oldfscompat) + ump->um_oldfscompat = malloc(512 + 3*sizeof(int32_t), + M_UFSMNT, M_WAITOK); + + memcpy(ump->um_oldfscompat, &fs->fs_old_postbl_start, 512); + extrasave = ump->um_oldfscompat; + extrasave += 512/sizeof(int32_t); + extrasave[0] = fs->fs_old_npsect; + extrasave[1] = fs->fs_old_interleave; + extrasave[2] = fs->fs_old_trackskew; + + /* These fields will be overwritten by their + * original values in fs_oldfscompat_write, so it is harmless + * to modify them here. 
+ */ + fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir; + fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree; + fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree; + fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree; + + fs->fs_maxbsize = fs->fs_bsize; + fs->fs_time = fs->fs_old_time; + fs->fs_size = fs->fs_old_size; + fs->fs_dsize = fs->fs_old_dsize; + fs->fs_csaddr = fs->fs_old_csaddr; + fs->fs_sblockloc = sblockloc; + + fs->fs_flags = fs->fs_old_flags | (fs->fs_flags & FS_INTERNAL); + + if (fs->fs_old_postblformat == FS_42POSTBLFMT) { + fs->fs_old_nrpos = 8; + fs->fs_old_npsect = fs->fs_old_nsect; + fs->fs_old_interleave = 1; + fs->fs_old_trackskew = 0; + } + + if (fs->fs_old_inodefmt < FS_44INODEFMT) { + fs->fs_maxfilesize = (u_quad_t) 1LL << 39; + fs->fs_qbmask = ~fs->fs_bmask; + fs->fs_qfmask = ~fs->fs_fmask; + } + + maxfilesize = (u_int64_t)0x80000000 * fs->fs_bsize - 1; + if (fs->fs_maxfilesize > maxfilesize) + fs->fs_maxfilesize = maxfilesize; + + /* Compatibility for old filesystems */ + if (fs->fs_avgfilesize <= 0) + fs->fs_avgfilesize = AVFILESIZ; + if (fs->fs_avgfpdir <= 0) + fs->fs_avgfpdir = AFPDIR; + +#if 0 + if (bigcgs) { + fs->fs_save_cgsize = fs->fs_cgsize; + fs->fs_cgsize = fs->fs_bsize; + } +#endif +} + +/* + * Unwinding superblock updates for old filesystems. + * See ffs_oldfscompat_read above for details. + * + * XXX - Parts get retired eventually. + * Unfortunately new bits get added. + */ +static void +ffs_oldfscompat_write(struct fs *fs, struct ufsmount *ump) +{ + int32_t *extrasave; + + if ((fs->fs_magic != FS_UFS1_MAGIC) || + (fs->fs_old_flags & FS_FLAGS_UPDATED)) + return; + + fs->fs_old_time = fs->fs_time; + fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir; + fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree; + fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree; + fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree; + fs->fs_old_flags = fs->fs_flags; + +#if 0 + if (bigcgs) { + fs->fs_cgsize = fs->fs_save_cgsize; + } +#endif + + memcpy(&fs->fs_old_postbl_start, ump->um_oldfscompat, 512); + extrasave = ump->um_oldfscompat; + extrasave += 512/sizeof(int32_t); + fs->fs_old_npsect = extrasave[0]; + fs->fs_old_interleave = extrasave[1]; + fs->fs_old_trackskew = extrasave[2]; + +} + +/* + * unmount vfs operation + */ +int +ffs_unmount(struct mount *mp, int mntflags) +{ + struct lwp *l = curlwp; + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + int error, flags; +#ifdef WAPBL + extern int doforce; +#endif + + flags = 0; + if (mntflags & MNT_FORCE) + flags |= FORCECLOSE; + if ((error = ffs_flushfiles(mp, flags, l)) != 0) + return (error); + error = UFS_WAPBL_BEGIN(mp); + if (error == 0) + if (fs->fs_ronly == 0 && + ffs_cgupdate(ump, MNT_WAIT) == 0 && + fs->fs_clean & FS_WASCLEAN) { + fs->fs_clean = FS_ISCLEAN; + fs->fs_fmod = 0; + (void) ffs_sbupdate(ump, MNT_WAIT); + } + if (error == 0) + UFS_WAPBL_END(mp); +#ifdef WAPBL + KASSERT(!(mp->mnt_wapbl_replay && mp->mnt_wapbl)); + if (mp->mnt_wapbl_replay) { + KDASSERT(fs->fs_ronly); + wapbl_replay_stop(mp->mnt_wapbl_replay); + wapbl_replay_free(mp->mnt_wapbl_replay); + mp->mnt_wapbl_replay = 0; + } + error = ffs_wapbl_stop(mp, doforce && (mntflags & MNT_FORCE)); + if (error) { + return error; + } +#endif /* WAPBL */ +#ifdef UFS_EXTATTR + if (ump->um_fstype == UFS1) { + ufs_extattr_stop(mp, l); + ufs_extattr_uepm_destroy(&ump->um_extattr); + } +#endif /* UFS_EXTATTR */ + + if (ump->um_devvp->v_type != VBAD) + ump->um_devvp->v_specmountpoint = NULL; + 
vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); + (void)VOP_CLOSE(ump->um_devvp, fs->fs_ronly ? FREAD : FREAD | FWRITE, + NOCRED); + vput(ump->um_devvp); + free(fs->fs_csp, M_UFSMNT); + free(fs, M_UFSMNT); + if (ump->um_oldfscompat != NULL) + free(ump->um_oldfscompat, M_UFSMNT); + mutex_destroy(&ump->um_lock); + ffs_snapshot_fini(ump); + free(ump, M_UFSMNT); + mp->mnt_data = NULL; + mp->mnt_flag &= ~MNT_LOCAL; + fstrans_unmount(mp); + return (0); +} + +/* + * Flush out all the files in a filesystem. + */ +int +ffs_flushfiles(struct mount *mp, int flags, struct lwp *l) +{ + extern int doforce; + struct ufsmount *ump; + int error; + + if (!doforce) + flags &= ~FORCECLOSE; + ump = VFSTOUFS(mp); +#ifdef QUOTA + if ((error = quota1_umount(mp, flags)) != 0) + return (error); +#endif +#ifdef QUOTA2 + if ((error = quota2_umount(mp, flags)) != 0) + return (error); +#endif + if ((error = vflush(mp, 0, SKIPSYSTEM | flags)) != 0) + return (error); + ffs_snapshot_unmount(mp); + /* + * Flush all the files. + */ + error = vflush(mp, NULLVP, flags); + if (error) + return (error); + /* + * Flush filesystem metadata. + */ + vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_FSYNC(ump->um_devvp, l->l_cred, FSYNC_WAIT, 0, 0); + VOP_UNLOCK(ump->um_devvp); + if (flags & FORCECLOSE) /* XXXDBJ */ + error = 0; + +#ifdef WAPBL + if (error) + return error; + if (mp->mnt_wapbl) { + error = wapbl_flush(mp->mnt_wapbl, 1); + if (flags & FORCECLOSE) + error = 0; + } +#endif + + return (error); +} + +/* + * Get file system statistics. + */ +int +ffs_statvfs(struct mount *mp, struct statvfs *sbp) +{ + struct ufsmount *ump; + struct fs *fs; + + ump = VFSTOUFS(mp); + fs = ump->um_fs; + mutex_enter(&ump->um_lock); + sbp->f_bsize = fs->fs_bsize; + sbp->f_frsize = fs->fs_fsize; + sbp->f_iosize = fs->fs_bsize; + sbp->f_blocks = fs->fs_dsize; + sbp->f_bfree = blkstofrags(fs, fs->fs_cstotal.cs_nbfree) + + fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks); + sbp->f_bresvd = ((u_int64_t) fs->fs_dsize * (u_int64_t) + fs->fs_minfree) / (u_int64_t) 100; + if (sbp->f_bfree > sbp->f_bresvd) + sbp->f_bavail = sbp->f_bfree - sbp->f_bresvd; + else + sbp->f_bavail = 0; + sbp->f_files = fs->fs_ncg * fs->fs_ipg - ROOTINO; + sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes; + sbp->f_favail = sbp->f_ffree; + sbp->f_fresvd = 0; + mutex_exit(&ump->um_lock); + copy_statvfs_info(sbp, mp); + + return (0); +} + +/* + * Go through the disk queues to initiate sandbagged IO; + * go through the inodes to write those that have been modified; + * initiate the writing of the super block if it has been modified. + * + * Note: we are always called with the filesystem marked `MPBUSY'. + */ +int +ffs_sync(struct mount *mp, int waitfor, kauth_cred_t cred) +{ + struct vnode *vp, *mvp, *nvp; + struct inode *ip; + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs; + int error, allerror = 0; + bool is_suspending; + + fs = ump->um_fs; + if (fs->fs_fmod != 0 && fs->fs_ronly != 0) { /* XXX */ + printf("fs = %s\n", fs->fs_fsmnt); + panic("update: rofs mod"); + } + + /* Allocate a marker vnode. */ + mvp = vnalloc(mp); + + fstrans_start(mp, FSTRANS_SHARED); + is_suspending = (fstrans_getstate(mp) == FSTRANS_SUSPENDING); + /* + * Write back each (modified) inode. 
+ */ + mutex_enter(&mntvnode_lock); +loop: + /* + * NOTE: not using the TAILQ_FOREACH here since in this loop vgone() + * and vclean() can be called indirectly + */ + for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) { + nvp = TAILQ_NEXT(vp, v_mntvnodes); + /* + * If the vnode that we are about to sync is no longer + * associated with this mount point, start over. + */ + if (vp->v_mount != mp) + goto loop; + /* + * Don't interfere with concurrent scans of this FS. + */ + if (vismarker(vp)) + continue; + mutex_enter(vp->v_interlock); + ip = VTOI(vp); + + /* + * Skip the vnode/inode if inaccessible. + */ + if (ip == NULL || (vp->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0 || + vp->v_type == VNON) { + mutex_exit(vp->v_interlock); + continue; + } + + /* + * We deliberately update inode times here. This will + * prevent a massive queue of updates accumulating, only + * to be handled by a call to unmount. + * + * XXX It would be better to have the syncer trickle these + * out. Adjustment needed to allow registering vnodes for + * sync when the vnode is clean, but the inode dirty. Or + * have ufs itself trickle out inode updates. + * + * If doing a lazy sync, we don't care about metadata or + * data updates, because they are handled by each vnode's + * synclist entry. In this case we are only interested in + * writing back modified inodes. + */ + if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE | + IN_MODIFY | IN_MODIFIED | IN_ACCESSED)) == 0 && + (waitfor == MNT_LAZY || (LIST_EMPTY(&vp->v_dirtyblkhd) && + UVM_OBJ_IS_CLEAN(&vp->v_uobj)))) { + mutex_exit(vp->v_interlock); + continue; + } + if (vp->v_type == VBLK && is_suspending) { + mutex_exit(vp->v_interlock); + continue; + } + vmark(mvp, vp); + mutex_exit(&mntvnode_lock); + error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT); + if (error) { + mutex_enter(&mntvnode_lock); + nvp = vunmark(mvp); + if (error == ENOENT) { + goto loop; + } + continue; + } + if (waitfor == MNT_LAZY) { + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (!error) { + error = ffs_update(vp, NULL, NULL, + UPDATE_CLOSE); + UFS_WAPBL_END(vp->v_mount); + } + } else { + error = VOP_FSYNC(vp, cred, FSYNC_NOLOG | + (waitfor == MNT_WAIT ? FSYNC_WAIT : 0), 0, 0); + } + if (error) + allerror = error; + vput(vp); + mutex_enter(&mntvnode_lock); + nvp = vunmark(mvp); + } + mutex_exit(&mntvnode_lock); + /* + * Force stale file system control information to be flushed. + */ + if (waitfor != MNT_LAZY && (ump->um_devvp->v_numoutput > 0 || + !LIST_EMPTY(&ump->um_devvp->v_dirtyblkhd))) { + vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); + if ((error = VOP_FSYNC(ump->um_devvp, cred, + (waitfor == MNT_WAIT ? FSYNC_WAIT : 0) | FSYNC_NOLOG, + 0, 0)) != 0) + allerror = error; + VOP_UNLOCK(ump->um_devvp); + if (allerror == 0 && waitfor == MNT_WAIT && !mp->mnt_wapbl) { + mutex_enter(&mntvnode_lock); + goto loop; + } + } +#if defined(QUOTA) || defined(QUOTA2) + qsync(mp); +#endif + /* + * Write back modified superblock. + */ + if (fs->fs_fmod != 0) { + fs->fs_fmod = 0; + fs->fs_time = time_second; + error = UFS_WAPBL_BEGIN(mp); + if (error) + allerror = error; + else { + if ((error = ffs_cgupdate(ump, waitfor))) + allerror = error; + UFS_WAPBL_END(mp); + } + } + +#ifdef WAPBL + if (mp->mnt_wapbl) { + error = wapbl_flush(mp->mnt_wapbl, 0); + if (error) + allerror = error; + } +#endif + + fstrans_done(mp); + vnfree(mvp); + return (allerror); +} + +/* + * Look up a FFS dinode number to find its incore vnode, otherwise read it + * in from disk. 
If it is in core, wait for the lock bit to clear, then + * return the inode locked. Detection and handling of mount points must be + * done by the calling routine. + */ +int +ffs_vget(struct mount *mp, ino_t ino, struct vnode **vpp) +{ + struct fs *fs; + struct inode *ip; + struct ufsmount *ump; + struct buf *bp; + struct vnode *vp; + dev_t dev; + int error; + + ump = VFSTOUFS(mp); + dev = ump->um_dev; + + retry: + if ((*vpp = ufs_ihashget(dev, ino, LK_EXCLUSIVE)) != NULL) + return (0); + + /* Allocate a new vnode/inode. */ + error = getnewvnode(VT_UFS, mp, ffs_vnodeop_p, NULL, &vp); + if (error) { + *vpp = NULL; + return (error); + } + ip = pool_cache_get(ffs_inode_cache, PR_WAITOK); + + /* + * If someone beat us to it, put back the freshly allocated + * vnode/inode pair and retry. + */ + mutex_enter(&ufs_hashlock); + if (ufs_ihashget(dev, ino, 0) != NULL) { + mutex_exit(&ufs_hashlock); + ungetnewvnode(vp); + pool_cache_put(ffs_inode_cache, ip); + goto retry; + } + + vp->v_vflag |= VV_LOCKSWORK; + + /* + * XXX MFS ends up here, too, to allocate an inode. Should we + * XXX create another pool for MFS inodes? + */ + + memset(ip, 0, sizeof(struct inode)); + vp->v_data = ip; + ip->i_vnode = vp; + ip->i_ump = ump; + ip->i_fs = fs = ump->um_fs; + ip->i_dev = dev; + ip->i_number = ino; +#if defined(QUOTA) || defined(QUOTA2) + ufsquota_init(ip); +#endif + + /* + * Initialize genfs node, we might proceed to destroy it in + * error branches. + */ + genfs_node_init(vp, &ffs_genfsops); + + /* + * Put it onto its hash chain and lock it so that other requests for + * this inode will block if they arrive while we are sleeping waiting + * for old data structures to be purged or for the contents of the + * disk portion of this inode to be read. + */ + + ufs_ihashins(ip); + mutex_exit(&ufs_hashlock); + + /* Read in the disk contents for the inode, copy into the inode. */ + error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), + (int)fs->fs_bsize, NOCRED, 0, &bp); + if (error) { + + /* + * The inode does not contain anything useful, so it would + * be misleading to leave it on its hash chain. With mode + * still zero, it will be unlinked and returned to the free + * list by vput(). + */ + + vput(vp); + brelse(bp, 0); + *vpp = NULL; + return (error); + } + if (ip->i_ump->um_fstype == UFS1) + ip->i_din.ffs1_din = pool_cache_get(ffs_dinode1_cache, + PR_WAITOK); + else + ip->i_din.ffs2_din = pool_cache_get(ffs_dinode2_cache, + PR_WAITOK); + ffs_load_inode(bp, ip, fs, ino); + brelse(bp, 0); + + /* + * Initialize the vnode from the inode, check for aliases. + * Note that the underlying vnode may have changed. + */ + + ufs_vinit(mp, ffs_specop_p, ffs_fifoop_p, &vp); + + /* + * Finish inode initialization now that aliasing has been resolved. + */ + + ip->i_devvp = ump->um_devvp; + vref(ip->i_devvp); + + /* + * Ensure that uid and gid are correct. This is a temporary + * fix until fsck has been changed to do the update. + */ + + if (fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */ + ip->i_uid = ip->i_ffs1_ouid; /* XXX */ + ip->i_gid = ip->i_ffs1_ogid; /* XXX */ + } /* XXX */ + uvm_vnp_setsize(vp, ip->i_size); + *vpp = vp; + return (0); +} + +/* + * File handle to vnode + * + * Have to be really careful about stale file handles: + * - check that the inode number is valid + * - call ffs_vget() to get the locked inode + * - check for an unallocated inode (i_mode == 0) + * - check that the given client host has export rights and return + * those rights via. 
exflagsp and credanonp + */ +int +ffs_fhtovp(struct mount *mp, struct fid *fhp, struct vnode **vpp) +{ + struct ufid ufh; + struct fs *fs; + + if (fhp->fid_len != sizeof(struct ufid)) + return EINVAL; + + memcpy(&ufh, fhp, sizeof(ufh)); + fs = VFSTOUFS(mp)->um_fs; + if (ufh.ufid_ino < ROOTINO || + ufh.ufid_ino >= fs->fs_ncg * fs->fs_ipg) + return (ESTALE); + return (ufs_fhtovp(mp, &ufh, vpp)); +} + +/* + * Vnode pointer to File handle + */ +/* ARGSUSED */ +int +ffs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size) +{ + struct inode *ip; + struct ufid ufh; + + if (*fh_size < sizeof(struct ufid)) { + *fh_size = sizeof(struct ufid); + return E2BIG; + } + ip = VTOI(vp); + *fh_size = sizeof(struct ufid); + memset(&ufh, 0, sizeof(ufh)); + ufh.ufid_len = sizeof(struct ufid); + ufh.ufid_ino = ip->i_number; + ufh.ufid_gen = ip->i_gen; + memcpy(fhp, &ufh, sizeof(ufh)); + return (0); +} + +void +ffs_init(void) +{ + if (ffs_initcount++ > 0) + return; + + ffs_inode_cache = pool_cache_init(sizeof(struct inode), 0, 0, 0, + "ffsino", NULL, IPL_NONE, NULL, NULL, NULL); + ffs_dinode1_cache = pool_cache_init(sizeof(struct ufs1_dinode), 0, 0, 0, + "ffsdino1", NULL, IPL_NONE, NULL, NULL, NULL); + ffs_dinode2_cache = pool_cache_init(sizeof(struct ufs2_dinode), 0, 0, 0, + "ffsdino2", NULL, IPL_NONE, NULL, NULL, NULL); + ufs_init(); +} + +void +ffs_reinit(void) +{ + + ufs_reinit(); +} + +void +ffs_done(void) +{ + if (--ffs_initcount > 0) + return; + + ufs_done(); + pool_cache_destroy(ffs_dinode2_cache); + pool_cache_destroy(ffs_dinode1_cache); + pool_cache_destroy(ffs_inode_cache); +} + +/* + * Write a superblock and associated information back to disk. + */ +int +ffs_sbupdate(struct ufsmount *mp, int waitfor) +{ + struct fs *fs = mp->um_fs; + struct buf *bp; + int error = 0; + u_int32_t saveflag; + + error = ffs_getblk(mp->um_devvp, + fs->fs_sblockloc / DEV_BSIZE, FFS_NOBLK, + fs->fs_sbsize, false, &bp); + if (error) + return error; + saveflag = fs->fs_flags & FS_INTERNAL; + fs->fs_flags &= ~FS_INTERNAL; + + memcpy(bp->b_data, fs, fs->fs_sbsize); + + ffs_oldfscompat_write((struct fs *)bp->b_data, mp); +#ifdef FFS_EI + if (mp->um_flags & UFS_NEEDSWAP) + ffs_sb_swap((struct fs *)bp->b_data, (struct fs *)bp->b_data); +#endif + fs->fs_flags |= saveflag; + + if (waitfor == MNT_WAIT) + error = bwrite(bp); + else + bawrite(bp); + return (error); +} + +int +ffs_cgupdate(struct ufsmount *mp, int waitfor) +{ + struct fs *fs = mp->um_fs; + struct buf *bp; + int blks; + void *space; + int i, size, error = 0, allerror = 0; + + allerror = ffs_sbupdate(mp, waitfor); + blks = howmany(fs->fs_cssize, fs->fs_fsize); + space = fs->fs_csp; + for (i = 0; i < blks; i += fs->fs_frag) { + size = fs->fs_bsize; + if (i + fs->fs_frag > blks) + size = (blks - i) * fs->fs_fsize; + error = ffs_getblk(mp->um_devvp, fsbtodb(fs, fs->fs_csaddr + i), + FFS_NOBLK, size, false, &bp); + if (error) + break; +#ifdef FFS_EI + if (mp->um_flags & UFS_NEEDSWAP) + ffs_csum_swap((struct csum*)space, + (struct csum*)bp->b_data, size); + else +#endif + memcpy(bp->b_data, space, (u_int)size); + space = (char *)space + size; + if (waitfor == MNT_WAIT) + error = bwrite(bp); + else + bawrite(bp); + } + if (!allerror && error) + allerror = error; + return (allerror); +} + +int +ffs_extattrctl(struct mount *mp, int cmd, struct vnode *vp, + int attrnamespace, const char *attrname) +{ +#ifdef UFS_EXTATTR + /* + * File-backed extended attributes are only supported on UFS1. + * UFS2 has native extended attributes. 
+ */ + if (VFSTOUFS(mp)->um_fstype == UFS1) + return (ufs_extattrctl(mp, cmd, vp, attrnamespace, attrname)); +#endif + return (vfs_stdextattrctl(mp, cmd, vp, attrnamespace, attrname)); +} + +int +ffs_suspendctl(struct mount *mp, int cmd) +{ + int error; + struct lwp *l = curlwp; + + switch (cmd) { + case SUSPEND_SUSPEND: + if ((error = fstrans_setstate(mp, FSTRANS_SUSPENDING)) != 0) + return error; + error = ffs_sync(mp, MNT_WAIT, l->l_proc->p_cred); + if (error == 0) + error = fstrans_setstate(mp, FSTRANS_SUSPENDED); +#ifdef WAPBL + if (error == 0 && mp->mnt_wapbl) + error = wapbl_flush(mp->mnt_wapbl, 1); +#endif + if (error != 0) { + (void) fstrans_setstate(mp, FSTRANS_NORMAL); + return error; + } + return 0; + + case SUSPEND_RESUME: + return fstrans_setstate(mp, FSTRANS_NORMAL); + + default: + return EINVAL; + } +} + +/* + * Synch vnode for a mounted file system. + */ +static int +ffs_vfs_fsync(vnode_t *vp, int flags) +{ + int error, i, pflags; +#ifdef WAPBL + struct mount *mp; +#endif + + KASSERT(vp->v_type == VBLK); + KASSERT(vp->v_specmountpoint != NULL); + + /* + * Flush all dirty data associated with the vnode. + */ + pflags = PGO_ALLPAGES | PGO_CLEANIT; + if ((flags & FSYNC_WAIT) != 0) + pflags |= PGO_SYNCIO; + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, 0, 0, pflags); + if (error) + return error; + +#ifdef WAPBL + mp = vp->v_specmountpoint; + if (mp && mp->mnt_wapbl) { + /* + * Don't bother writing out metadata if the syncer is + * making the request. We will let the sync vnode + * write it out in a single burst through a call to + * VFS_SYNC(). + */ + if ((flags & (FSYNC_DATAONLY | FSYNC_LAZY | FSYNC_NOLOG)) != 0) + return 0; + + /* + * Don't flush the log if the vnode being flushed + * contains no dirty buffers that could be in the log. + */ + if (!LIST_EMPTY(&vp->v_dirtyblkhd)) { + error = wapbl_flush(mp->mnt_wapbl, 0); + if (error) + return error; + } + + if ((flags & FSYNC_WAIT) != 0) { + mutex_enter(vp->v_interlock); + while (vp->v_numoutput) + cv_wait(&vp->v_cv, vp->v_interlock); + mutex_exit(vp->v_interlock); + } + + return 0; + } +#endif /* WAPBL */ + + error = vflushbuf(vp, (flags & FSYNC_WAIT) != 0); + if (error == 0 && (flags & FSYNC_CACHE) != 0) { + i = 1; + (void)VOP_IOCTL(vp, DIOCCACHESYNC, &i, FWRITE, + kauth_cred_get()); + } + + return error; +} diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c new file mode 100644 index 000000000..9acc0bdce --- /dev/null +++ b/sys/ufs/ffs/ffs_vnops.c @@ -0,0 +1,785 @@ +/* $NetBSD: ffs_vnops.c,v 1.120 2011/06/27 16:34:47 manu Exp $ */ + +/*- + * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc, and by Andrew Doran. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ffs_vnops.c,v 1.120 2011/06/27 16:34:47 manu Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_ffs.h" +#include "opt_wapbl.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include + +/* Global vfs data structures for ufs. 
*/ +int (**ffs_vnodeop_p)(void *); +const struct vnodeopv_entry_desc ffs_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, ufs_lookup }, /* lookup */ + { &vop_create_desc, ufs_create }, /* create */ + { &vop_whiteout_desc, ufs_whiteout }, /* whiteout */ + { &vop_mknod_desc, ufs_mknod }, /* mknod */ + { &vop_open_desc, ufs_open }, /* open */ + { &vop_close_desc, ufs_close }, /* close */ + { &vop_access_desc, ufs_access }, /* access */ + { &vop_getattr_desc, ufs_getattr }, /* getattr */ + { &vop_setattr_desc, ufs_setattr }, /* setattr */ + { &vop_read_desc, ffs_read }, /* read */ + { &vop_write_desc, ffs_write }, /* write */ + { &vop_ioctl_desc, ufs_ioctl }, /* ioctl */ + { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */ + { &vop_poll_desc, ufs_poll }, /* poll */ + { &vop_kqfilter_desc, genfs_kqfilter }, /* kqfilter */ + { &vop_revoke_desc, ufs_revoke }, /* revoke */ + { &vop_mmap_desc, ufs_mmap }, /* mmap */ + { &vop_fsync_desc, ffs_fsync }, /* fsync */ + { &vop_seek_desc, ufs_seek }, /* seek */ + { &vop_remove_desc, ufs_remove }, /* remove */ + { &vop_link_desc, ufs_link }, /* link */ + { &vop_rename_desc, ufs_rename }, /* rename */ + { &vop_mkdir_desc, ufs_mkdir }, /* mkdir */ + { &vop_rmdir_desc, ufs_rmdir }, /* rmdir */ + { &vop_symlink_desc, ufs_symlink }, /* symlink */ + { &vop_readdir_desc, ufs_readdir }, /* readdir */ + { &vop_readlink_desc, ufs_readlink }, /* readlink */ + { &vop_abortop_desc, ufs_abortop }, /* abortop */ + { &vop_inactive_desc, ufs_inactive }, /* inactive */ + { &vop_reclaim_desc, ffs_reclaim }, /* reclaim */ + { &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, ufs_bmap }, /* bmap */ + { &vop_strategy_desc, ufs_strategy }, /* strategy */ + { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, ufs_pathconf }, /* pathconf */ + { &vop_advlock_desc, ufs_advlock }, /* advlock */ + { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ + { &vop_getpages_desc, genfs_getpages }, /* getpages */ + { &vop_putpages_desc, genfs_putpages }, /* putpages */ + { &vop_openextattr_desc, ffs_openextattr }, /* openextattr */ + { &vop_closeextattr_desc, ffs_closeextattr }, /* closeextattr */ + { &vop_getextattr_desc, ffs_getextattr }, /* getextattr */ + { &vop_setextattr_desc, ffs_setextattr }, /* setextattr */ + { &vop_listextattr_desc, ffs_listextattr }, /* listextattr */ + { &vop_deleteextattr_desc, ffs_deleteextattr }, /* deleteextattr */ + { NULL, NULL } +}; +const struct vnodeopv_desc ffs_vnodeop_opv_desc = + { &ffs_vnodeop_p, ffs_vnodeop_entries }; + +int (**ffs_specop_p)(void *); +const struct vnodeopv_entry_desc ffs_specop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, spec_lookup }, /* lookup */ + { &vop_create_desc, spec_create }, /* create */ + { &vop_mknod_desc, spec_mknod }, /* mknod */ + { &vop_open_desc, spec_open }, /* open */ + { &vop_close_desc, ufsspec_close }, /* close */ + { &vop_access_desc, ufs_access }, /* access */ + { &vop_getattr_desc, ufs_getattr }, /* getattr */ + { &vop_setattr_desc, ufs_setattr }, /* setattr */ + { &vop_read_desc, ufsspec_read }, /* read */ + { &vop_write_desc, ufsspec_write }, /* write */ + { &vop_ioctl_desc, spec_ioctl }, /* ioctl */ + { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */ + { &vop_poll_desc, spec_poll }, /* poll */ + { &vop_kqfilter_desc, spec_kqfilter }, /* kqfilter */ + { &vop_revoke_desc, spec_revoke }, /* revoke */ + { &vop_mmap_desc, spec_mmap 
}, /* mmap */ + { &vop_fsync_desc, ffs_spec_fsync }, /* fsync */ + { &vop_seek_desc, spec_seek }, /* seek */ + { &vop_remove_desc, spec_remove }, /* remove */ + { &vop_link_desc, spec_link }, /* link */ + { &vop_rename_desc, spec_rename }, /* rename */ + { &vop_mkdir_desc, spec_mkdir }, /* mkdir */ + { &vop_rmdir_desc, spec_rmdir }, /* rmdir */ + { &vop_symlink_desc, spec_symlink }, /* symlink */ + { &vop_readdir_desc, spec_readdir }, /* readdir */ + { &vop_readlink_desc, spec_readlink }, /* readlink */ + { &vop_abortop_desc, spec_abortop }, /* abortop */ + { &vop_inactive_desc, ufs_inactive }, /* inactive */ + { &vop_reclaim_desc, ffs_reclaim }, /* reclaim */ + { &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, spec_bmap }, /* bmap */ + { &vop_strategy_desc, spec_strategy }, /* strategy */ + { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, spec_pathconf }, /* pathconf */ + { &vop_advlock_desc, spec_advlock }, /* advlock */ + { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ + { &vop_getpages_desc, spec_getpages }, /* getpages */ + { &vop_putpages_desc, spec_putpages }, /* putpages */ + { &vop_openextattr_desc, ffs_openextattr }, /* openextattr */ + { &vop_closeextattr_desc, ffs_closeextattr }, /* closeextattr */ + { &vop_getextattr_desc, ffs_getextattr }, /* getextattr */ + { &vop_setextattr_desc, ffs_setextattr }, /* setextattr */ + { &vop_listextattr_desc, ffs_listextattr }, /* listextattr */ + { &vop_deleteextattr_desc, ffs_deleteextattr }, /* deleteextattr */ + { NULL, NULL } +}; +const struct vnodeopv_desc ffs_specop_opv_desc = + { &ffs_specop_p, ffs_specop_entries }; + +int (**ffs_fifoop_p)(void *); +const struct vnodeopv_entry_desc ffs_fifoop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, vn_fifo_bypass }, /* lookup */ + { &vop_create_desc, vn_fifo_bypass }, /* create */ + { &vop_mknod_desc, vn_fifo_bypass }, /* mknod */ + { &vop_open_desc, vn_fifo_bypass }, /* open */ + { &vop_close_desc, ufsfifo_close }, /* close */ + { &vop_access_desc, ufs_access }, /* access */ + { &vop_getattr_desc, ufs_getattr }, /* getattr */ + { &vop_setattr_desc, ufs_setattr }, /* setattr */ + { &vop_read_desc, ufsfifo_read }, /* read */ + { &vop_write_desc, ufsfifo_write }, /* write */ + { &vop_ioctl_desc, vn_fifo_bypass }, /* ioctl */ + { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */ + { &vop_poll_desc, vn_fifo_bypass }, /* poll */ + { &vop_kqfilter_desc, vn_fifo_bypass }, /* kqfilter */ + { &vop_revoke_desc, vn_fifo_bypass }, /* revoke */ + { &vop_mmap_desc, vn_fifo_bypass }, /* mmap */ + { &vop_fsync_desc, ffs_fsync }, /* fsync */ + { &vop_seek_desc, vn_fifo_bypass }, /* seek */ + { &vop_remove_desc, vn_fifo_bypass }, /* remove */ + { &vop_link_desc, vn_fifo_bypass }, /* link */ + { &vop_rename_desc, vn_fifo_bypass }, /* rename */ + { &vop_mkdir_desc, vn_fifo_bypass }, /* mkdir */ + { &vop_rmdir_desc, vn_fifo_bypass }, /* rmdir */ + { &vop_symlink_desc, vn_fifo_bypass }, /* symlink */ + { &vop_readdir_desc, vn_fifo_bypass }, /* readdir */ + { &vop_readlink_desc, vn_fifo_bypass }, /* readlink */ + { &vop_abortop_desc, vn_fifo_bypass }, /* abortop */ + { &vop_inactive_desc, ufs_inactive }, /* inactive */ + { &vop_reclaim_desc, ffs_reclaim }, /* reclaim */ + { &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, vn_fifo_bypass }, /* bmap */ + { &vop_strategy_desc, vn_fifo_bypass }, /* strategy */ 
+ { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, vn_fifo_bypass }, /* pathconf */ + { &vop_advlock_desc, vn_fifo_bypass }, /* advlock */ + { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ + { &vop_putpages_desc, vn_fifo_bypass }, /* putpages */ + { &vop_openextattr_desc, ffs_openextattr }, /* openextattr */ + { &vop_closeextattr_desc, ffs_closeextattr }, /* closeextattr */ + { &vop_getextattr_desc, ffs_getextattr }, /* getextattr */ + { &vop_setextattr_desc, ffs_setextattr }, /* setextattr */ + { &vop_listextattr_desc, ffs_listextattr }, /* listextattr */ + { &vop_deleteextattr_desc, ffs_deleteextattr }, /* deleteextattr */ + { NULL, NULL } +}; +const struct vnodeopv_desc ffs_fifoop_opv_desc = + { &ffs_fifoop_p, ffs_fifoop_entries }; + +#include + +int +ffs_spec_fsync(void *v) +{ + struct vop_fsync_args /* { + struct vnode *a_vp; + kauth_cred_t a_cred; + int a_flags; + off_t a_offlo; + off_t a_offhi; + struct lwp *a_l; + } */ *ap = v; + int error, flags, uflags; + struct vnode *vp; + struct mount *mp; + + flags = ap->a_flags; + uflags = UPDATE_CLOSE | ((flags & FSYNC_WAIT) ? UPDATE_WAIT : 0); + vp = ap->a_vp; + mp = vp->v_mount; + + fstrans_start(mp, FSTRANS_LAZY); + + error = spec_fsync(v); + if (error) + goto out; + +#ifdef WAPBL + if (mp && mp->mnt_wapbl) { + /* + * Don't bother writing out metadata if the syncer is + * making the request. We will let the sync vnode + * write it out in a single burst through a call to + * VFS_SYNC(). + */ + if ((flags & (FSYNC_DATAONLY | FSYNC_LAZY)) != 0) + goto out; + if ((VTOI(vp)->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE + | IN_MODIFY | IN_MODIFIED | IN_ACCESSED)) != 0) { + error = UFS_WAPBL_BEGIN(mp); + if (error != 0) + goto out; + error = ffs_update(vp, NULL, NULL, uflags); + UFS_WAPBL_END(mp); + } + goto out; + } +#endif /* WAPBL */ + + error = ffs_update(vp, NULL, NULL, uflags); + +out: + fstrans_done(mp); + return error; +} + +int +ffs_fsync(void *v) +{ + struct vop_fsync_args /* { + struct vnode *a_vp; + kauth_cred_t a_cred; + int a_flags; + off_t a_offlo; + off_t a_offhi; + struct lwp *a_l; + } */ *ap = v; + struct buf *bp; + int num, error, i; + struct indir ia[NIADDR + 1]; + int bsize; + daddr_t blk_high; + struct vnode *vp; + struct mount *mp; + + vp = ap->a_vp; + mp = vp->v_mount; + + fstrans_start(mp, FSTRANS_LAZY); + if ((ap->a_offlo == 0 && ap->a_offhi == 0) || (vp->v_type != VREG)) { + error = ffs_full_fsync(vp, ap->a_flags); + goto out; + } + + bsize = mp->mnt_stat.f_iosize; + blk_high = ap->a_offhi / bsize; + if (ap->a_offhi % bsize != 0) + blk_high++; + + /* + * First, flush all pages in range. + */ + + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, trunc_page(ap->a_offlo), + round_page(ap->a_offhi), PGO_CLEANIT | + ((ap->a_flags & FSYNC_WAIT) ? PGO_SYNCIO : 0)); + if (error) { + goto out; + } + +#ifdef WAPBL + KASSERT(vp->v_type == VREG); + if (mp->mnt_wapbl) { + /* + * Don't bother writing out metadata if the syncer is + * making the request. We will let the sync vnode + * write it out in a single burst through a call to + * VFS_SYNC(). + */ + if ((ap->a_flags & (FSYNC_DATAONLY | FSYNC_LAZY)) != 0) { + fstrans_done(mp); + return 0; + } + error = 0; + if (vp->v_tag == VT_UFS && VTOI(vp)->i_flag & + (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY | + IN_MODIFIED | IN_ACCESSED)) { + error = UFS_WAPBL_BEGIN(mp); + if (error) { + fstrans_done(mp); + return error; + } + error = ffs_update(vp, NULL, NULL, UPDATE_CLOSE | + ((ap->a_flags & FSYNC_WAIT) ? 
UPDATE_WAIT : 0)); + UFS_WAPBL_END(mp); + } + if (error || (ap->a_flags & FSYNC_NOLOG) != 0) { + fstrans_done(mp); + return error; + } + error = wapbl_flush(mp->mnt_wapbl, 0); + fstrans_done(mp); + return error; + } +#endif /* WAPBL */ + + /* + * Then, flush indirect blocks. + */ + + if (blk_high >= NDADDR) { + error = ufs_getlbns(vp, blk_high, ia, &num); + if (error) + goto out; + + mutex_enter(&bufcache_lock); + for (i = 0; i < num; i++) { + if ((bp = incore(vp, ia[i].in_lbn)) == NULL) + continue; + if ((bp->b_cflags & BC_BUSY) != 0 || + (bp->b_oflags & BO_DELWRI) == 0) + continue; + bp->b_cflags |= BC_BUSY | BC_VFLUSH; + mutex_exit(&bufcache_lock); + bawrite(bp); + mutex_enter(&bufcache_lock); + } + mutex_exit(&bufcache_lock); + } + + if (ap->a_flags & FSYNC_WAIT) { + mutex_enter(vp->v_interlock); + while (vp->v_numoutput > 0) + cv_wait(&vp->v_cv, vp->v_interlock); + mutex_exit(vp->v_interlock); + } + + error = ffs_update(vp, NULL, NULL, UPDATE_CLOSE | + (((ap->a_flags & (FSYNC_WAIT | FSYNC_DATAONLY)) == FSYNC_WAIT) + ? UPDATE_WAIT : 0)); + + if (error == 0 && ap->a_flags & FSYNC_CACHE) { + int l = 0; + VOP_IOCTL(VTOI(vp)->i_devvp, DIOCCACHESYNC, &l, FWRITE, + curlwp->l_cred); + } + +out: + fstrans_done(mp); + return error; +} + +/* + * Synch an open file. Called for VOP_FSYNC(). + */ +/* ARGSUSED */ +int +ffs_full_fsync(struct vnode *vp, int flags) +{ + int error, i, uflags; + struct mount *mp; + + KASSERT(vp->v_tag == VT_UFS); + KASSERT(VTOI(vp) != NULL); + KASSERT(vp->v_type != VCHR && vp->v_type != VBLK); + + error = 0; + uflags = UPDATE_CLOSE | ((flags & FSYNC_WAIT) ? UPDATE_WAIT : 0); + + mp = vp->v_mount; + + /* + * Flush all dirty data associated with the vnode. + */ + if (vp->v_type == VREG) { + int pflags = PGO_ALLPAGES | PGO_CLEANIT; + + if ((flags & FSYNC_WAIT)) + pflags |= PGO_SYNCIO; + if (fstrans_getstate(mp) == FSTRANS_SUSPENDING) + pflags |= PGO_FREE; + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, 0, 0, pflags); + if (error) + return error; + } + +#ifdef WAPBL + if (mp && mp->mnt_wapbl) { + /* + * Don't bother writing out metadata if the syncer is + * making the request. We will let the sync vnode + * write it out in a single burst through a call to + * VFS_SYNC(). + */ + if ((flags & (FSYNC_DATAONLY | FSYNC_LAZY)) != 0) + return 0; + + if ((VTOI(vp)->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE + | IN_MODIFY | IN_MODIFIED | IN_ACCESSED)) != 0) { + error = UFS_WAPBL_BEGIN(mp); + if (error) + return error; + error = ffs_update(vp, NULL, NULL, uflags); + UFS_WAPBL_END(mp); + } + if (error || (flags & FSYNC_NOLOG) != 0) + return error; + + /* + * Don't flush the log if the vnode being flushed + * contains no dirty buffers that could be in the log. + */ + if (!LIST_EMPTY(&vp->v_dirtyblkhd)) { + error = wapbl_flush(mp->mnt_wapbl, 0); + if (error) + return error; + } + + if ((flags & FSYNC_WAIT) != 0) { + mutex_enter(vp->v_interlock); + while (vp->v_numoutput != 0) + cv_wait(&vp->v_cv, vp->v_interlock); + mutex_exit(vp->v_interlock); + } + + return error; + } +#endif /* WAPBL */ + + error = vflushbuf(vp, (flags & FSYNC_WAIT) != 0); + if (error == 0) + error = ffs_update(vp, NULL, NULL, uflags); + if (error == 0 && (flags & FSYNC_CACHE) != 0) { + i = 1; + (void)VOP_IOCTL(VTOI(vp)->i_devvp, DIOCCACHESYNC, &i, FWRITE, + kauth_cred_get()); + } + + return error; +} + +/* + * Reclaim an inode so that it can be used for other purposes. 
+ */ +int +ffs_reclaim(void *v) +{ + struct vop_reclaim_args /* { + struct vnode *a_vp; + struct lwp *a_l; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct mount *mp = vp->v_mount; + struct ufsmount *ump = ip->i_ump; + void *data; + int error; + + fstrans_start(mp, FSTRANS_LAZY); + /* + * The inode must be freed and updated before being removed + * from its hash chain. Other threads trying to gain a hold + * on the inode will be stalled because it is locked (VI_XLOCK). + */ + error = UFS_WAPBL_BEGIN(mp); + if (error) { + fstrans_done(mp); + return error; + } + if (ip->i_nlink <= 0 && ip->i_omode != 0 && + (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) + ffs_vfree(vp, ip->i_number, ip->i_omode); + UFS_WAPBL_END(mp); + if ((error = ufs_reclaim(vp)) != 0) { + fstrans_done(mp); + return (error); + } + if (ip->i_din.ffs1_din != NULL) { + if (ump->um_fstype == UFS1) + pool_cache_put(ffs_dinode1_cache, ip->i_din.ffs1_din); + else + pool_cache_put(ffs_dinode2_cache, ip->i_din.ffs2_din); + } + /* + * To interlock with ffs_sync(). + */ + genfs_node_destroy(vp); + mutex_enter(vp->v_interlock); + data = vp->v_data; + vp->v_data = NULL; + mutex_exit(vp->v_interlock); + + /* + * XXX MFS ends up here, too, to free an inode. Should we create + * XXX a separate pool for MFS inodes? + */ + pool_cache_put(ffs_inode_cache, data); + fstrans_done(mp); + return (0); +} + +/* + * Return the last logical file offset that should be written for this file + * if we're doing a write that ends at "size". + */ + +void +ffs_gop_size(struct vnode *vp, off_t size, off_t *eobp, int flags) +{ + struct inode *ip = VTOI(vp); + struct fs *fs = ip->i_fs; + daddr_t olbn, nlbn; + + olbn = lblkno(fs, ip->i_size); + nlbn = lblkno(fs, size); + if (nlbn < NDADDR && olbn <= nlbn) { + *eobp = fragroundup(fs, size); + } else { + *eobp = blkroundup(fs, size); + } +} + +int +ffs_openextattr(void *v) +{ + struct vop_openextattr_args /* { + struct vnode *a_vp; + kauth_cred_t a_cred; + struct proc *a_p; + } */ *ap = v; + struct inode *ip = VTOI(ap->a_vp); + struct fs *fs = ip->i_fs; + + /* Not supported for UFS1 file systems. */ + if (fs->fs_magic == FS_UFS1_MAGIC) + return (EOPNOTSUPP); + + /* XXX Not implemented for UFS2 file systems. */ + return (EOPNOTSUPP); +} + +int +ffs_closeextattr(void *v) +{ + struct vop_closeextattr_args /* { + struct vnode *a_vp; + int a_commit; + kauth_cred_t a_cred; + struct proc *a_p; + } */ *ap = v; + struct inode *ip = VTOI(ap->a_vp); + struct fs *fs = ip->i_fs; + + /* Not supported for UFS1 file systems. */ + if (fs->fs_magic == FS_UFS1_MAGIC) + return (EOPNOTSUPP); + + /* XXX Not implemented for UFS2 file systems. */ + return (EOPNOTSUPP); +} + +int +ffs_getextattr(void *v) +{ + struct vop_getextattr_args /* { + struct vnode *a_vp; + int a_attrnamespace; + const char *a_name; + struct uio *a_uio; + size_t *a_size; + kauth_cred_t a_cred; + struct proc *a_p; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct fs *fs = ip->i_fs; + + if (fs->fs_magic == FS_UFS1_MAGIC) { +#ifdef UFS_EXTATTR + int error; + + fstrans_start(vp->v_mount, FSTRANS_SHARED); + error = ufs_getextattr(ap); + fstrans_done(vp->v_mount); + return error; +#else + return (EOPNOTSUPP); +#endif + } + + /* XXX Not implemented for UFS2 file systems. 
*/ + return (EOPNOTSUPP); +} + +int +ffs_setextattr(void *v) +{ + struct vop_setextattr_args /* { + struct vnode *a_vp; + int a_attrnamespace; + const char *a_name; + struct uio *a_uio; + kauth_cred_t a_cred; + struct proc *a_p; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct fs *fs = ip->i_fs; + + if (fs->fs_magic == FS_UFS1_MAGIC) { +#ifdef UFS_EXTATTR + int error; + + fstrans_start(vp->v_mount, FSTRANS_SHARED); + error = ufs_setextattr(ap); + fstrans_done(vp->v_mount); + return error; +#else + return (EOPNOTSUPP); +#endif + } + + /* XXX Not implemented for UFS2 file systems. */ + return (EOPNOTSUPP); +} + +int +ffs_listextattr(void *v) +{ + struct vop_listextattr_args /* { + struct vnode *a_vp; + int a_attrnamespace; + struct uio *a_uio; + size_t *a_size; + kauth_cred_t a_cred; + struct proc *a_p; + } */ *ap = v; + struct inode *ip = VTOI(ap->a_vp); + struct fs *fs = ip->i_fs; + + if (fs->fs_magic == FS_UFS1_MAGIC) { +#ifdef UFS_EXTATTR + struct vnode *vp = ap->a_vp; + int error; + + fstrans_start(vp->v_mount, FSTRANS_SHARED); + error = ufs_listextattr(ap); + fstrans_done(vp->v_mount); + return error; +#else + return (EOPNOTSUPP); +#endif + } + + /* XXX Not implemented for UFS2 file systems. */ + return (EOPNOTSUPP); +} + +int +ffs_deleteextattr(void *v) +{ + struct vop_deleteextattr_args /* { + struct vnode *a_vp; + int a_attrnamespace; + kauth_cred_t a_cred; + struct proc *a_p; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct fs *fs = ip->i_fs; + + if (fs->fs_magic == FS_UFS1_MAGIC) { +#ifdef UFS_EXTATTR + int error; + + fstrans_start(vp->v_mount, FSTRANS_SHARED); + error = ufs_deleteextattr(ap); + fstrans_done(vp->v_mount); + return error; +#else + return (EOPNOTSUPP); +#endif + } + + /* XXX Not implemented for UFS2 file systems. */ + return (EOPNOTSUPP); +} diff --git a/sys/ufs/ffs/ffs_wapbl.c b/sys/ufs/ffs/ffs_wapbl.c new file mode 100644 index 000000000..aa6b2dae1 --- /dev/null +++ b/sys/ufs/ffs/ffs_wapbl.c @@ -0,0 +1,883 @@ +/* $NetBSD: ffs_wapbl.c,v 1.17 2010/12/24 13:38:57 mlelstv Exp $ */ + +/*- + * Copyright (c) 2003,2006,2008 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include
+__KERNEL_RCSID(0, "$NetBSD: ffs_wapbl.c,v 1.17 2010/12/24 13:38:57 mlelstv Exp $");
+
+#define WAPBL_INTERNAL
+
+#if defined(_KERNEL_OPT)
+#include "opt_ffs.h"
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#undef WAPBL_DEBUG
+#ifdef WAPBL_DEBUG
+int ffs_wapbl_debug = 1;
+#define DPRINTF(fmt, args...) \
+do { \
+ if (ffs_wapbl_debug) \
+ printf("%s:%d "fmt, __func__ , __LINE__, ##args); \
+} while (/* CONSTCOND */0)
+#else
+#define DPRINTF(fmt, args...) \
+do { \
+ /* nothing */ \
+} while (/* CONSTCOND */0)
+#endif
+
+static int ffs_superblock_layout(struct fs *);
+static int wapbl_log_position(struct mount *, struct fs *, struct vnode *,
+ daddr_t *, size_t *, size_t *, uint64_t *);
+static int wapbl_create_infs_log(struct mount *, struct fs *, struct vnode *,
+ daddr_t *, size_t *, uint64_t *);
+static void wapbl_find_log_start(struct mount *, struct vnode *, off_t,
+ daddr_t *, daddr_t *, size_t *);
+static int wapbl_remove_log(struct mount *);
+static int wapbl_allocate_log_file(struct mount *, struct vnode *,
+ daddr_t *, size_t *, uint64_t *);
+
+/*
+ * Return the super block layout format - UFS1 or UFS2.
+ * WAPBL only works with UFS2 layout (which is still available
+ * with FFSv1).
+ *
+ * XXX Should this be in ufs/ffs/fs.h? Same style of check is
+ * also used in ffs_alloc.c in a few places.
+ */
+static int
+ffs_superblock_layout(struct fs *fs)
+{
+ if ((fs->fs_magic == FS_UFS1_MAGIC) &&
+ ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0))
+ return 1;
+ else
+ return 2;
+}
+
+/*
+ * This function is invoked after a log is replayed to
+ * disk to perform logical cleanup actions as described by
+ * the log
+ */
+void
+ffs_wapbl_replay_finish(struct mount *mp)
+{
+ struct wapbl_replay *wr = mp->mnt_wapbl_replay;
+ int i;
+ int error;
+
+ if (!wr)
+ return;
+
+ KDASSERT((mp->mnt_flag & MNT_RDONLY) == 0);
+
+ for (i = 0; i < wr->wr_inodescnt; i++) {
+ struct vnode *vp;
+ struct inode *ip;
+ error = VFS_VGET(mp, wr->wr_inodes[i].wr_inumber, &vp);
+ if (error) {
+ printf("ffs_wapbl_replay_finish: "
+ "unable to cleanup inode %" PRIu32 "\n",
+ wr->wr_inodes[i].wr_inumber);
+ continue;
+ }
+ ip = VTOI(vp);
+ KDASSERT(wr->wr_inodes[i].wr_inumber == ip->i_number);
+#ifdef WAPBL_DEBUG
+ printf("ffs_wapbl_replay_finish: "
+ "cleaning inode %" PRIu64 " size=%" PRIu64 " mode=%o nlink=%d\n",
+ ip->i_number, ip->i_size, ip->i_mode, ip->i_nlink);
+#endif
+ KASSERT(ip->i_nlink == 0);
+
+ /*
+ * The journal may have left partially allocated inodes in mode
+ * zero. This may occur if a crash occurs between the node
+ * allocation in ffs_nodeallocg and when the node is properly
+ * initialized in ufs_makeinode. If so, just deallocate them.
+ */ + if (ip->i_mode == 0) { + UFS_WAPBL_BEGIN(mp); + ffs_vfree(vp, ip->i_number, wr->wr_inodes[i].wr_imode); + UFS_WAPBL_END(mp); + } + vput(vp); + } + wapbl_replay_stop(wr); + wapbl_replay_free(wr); + mp->mnt_wapbl_replay = NULL; +} + +/* Callback for wapbl */ +void +ffs_wapbl_sync_metadata(struct mount *mp, daddr_t *deallocblks, + int *dealloclens, int dealloccnt) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + int i, error; + +#ifdef WAPBL_DEBUG_INODES + ufs_wapbl_verify_inodes(mp, "ffs_wapbl_sync_metadata"); +#endif + + for (i = 0; i< dealloccnt; i++) { + /* + * blkfree errors are unreported, might silently fail + * if it cannot read the cylinder group block + */ + ffs_blkfree(fs, ump->um_devvp, + dbtofsb(fs, deallocblks[i]), dealloclens[i], -1); + } + + fs->fs_fmod = 0; + fs->fs_time = time_second; + error = ffs_cgupdate(ump, 0); + KASSERT(error == 0); +} + +void +ffs_wapbl_abort_sync_metadata(struct mount *mp, daddr_t *deallocblks, + int *dealloclens, int dealloccnt) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + int i; + + for (i = 0; i < dealloccnt; i++) { + /* + * Since the above blkfree may have failed, this blkalloc might + * fail as well, so don't check its error. Note that if the + * blkfree succeeded above, then this shouldn't fail because + * the buffer will be locked in the current transaction. + */ + ffs_blkalloc_ump(ump, dbtofsb(fs, deallocblks[i]), + dealloclens[i]); + } +} + +static int +wapbl_remove_log(struct mount *mp) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + struct vnode *vp; + struct inode *ip; + ino_t log_ino; + int error; + + /* If super block layout is too old to support WAPBL, return */ + if (ffs_superblock_layout(fs) < 2) + return 0; + + /* If all the log locators are 0, just clean up */ + if (fs->fs_journallocs[0] == 0 && + fs->fs_journallocs[1] == 0 && + fs->fs_journallocs[2] == 0 && + fs->fs_journallocs[3] == 0) { + DPRINTF("empty locators, just clear\n"); + goto done; + } + + switch (fs->fs_journal_location) { + case UFS_WAPBL_JOURNALLOC_NONE: + /* nothing! */ + DPRINTF("no log\n"); + break; + + case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM: + log_ino = fs->fs_journallocs[UFS_WAPBL_INFS_INO]; + DPRINTF("in-fs log, ino = %" PRId64 "\n",log_ino); + + /* if no existing log inode, just clear all fields and bail */ + if (log_ino == 0) + goto done; + error = VFS_VGET(mp, log_ino, &vp); + if (error != 0) { + printf("ffs_wapbl: vget failed %d\n", + error); + /* clear out log info on error */ + goto done; + } + ip = VTOI(vp); + KASSERT(log_ino == ip->i_number); + if ((ip->i_flags & SF_LOG) == 0) { + printf("ffs_wapbl: try to clear non-log inode " + "%" PRId64 "\n", log_ino); + vput(vp); + /* clear out log info on error */ + goto done; + } + + /* + * remove the log inode by setting its link count back + * to zero and bail. 
+ */ + ip->i_nlink = 0; + DIP_ASSIGN(ip, nlink, 0); + vput(vp); + + case UFS_WAPBL_JOURNALLOC_END_PARTITION: + DPRINTF("end-of-partition log\n"); + /* no extra work required */ + break; + + default: + printf("ffs_wapbl: unknown journal type %d\n", + fs->fs_journal_location); + break; + } + + +done: + /* Clear out all previous knowledge of journal */ + fs->fs_journal_version = 0; + fs->fs_journal_location = 0; + fs->fs_journal_flags = 0; + fs->fs_journallocs[0] = 0; + fs->fs_journallocs[1] = 0; + fs->fs_journallocs[2] = 0; + fs->fs_journallocs[3] = 0; + (void) ffs_sbupdate(ump, MNT_WAIT); + + return 0; +} + +int +ffs_wapbl_start(struct mount *mp) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + struct vnode *devvp = ump->um_devvp; + daddr_t off; + size_t count; + size_t blksize; + uint64_t extradata; + int error; + + if (mp->mnt_wapbl == NULL) { + if (fs->fs_journal_flags & UFS_WAPBL_FLAGS_CLEAR_LOG) { + /* Clear out any existing journal file */ + error = wapbl_remove_log(mp); + if (error != 0) + return error; + } + + if (mp->mnt_flag & MNT_LOG) { + KDASSERT(fs->fs_ronly == 0); + + /* WAPBL needs UFS2 format super block */ + if (ffs_superblock_layout(fs) < 2) { + printf("%s fs superblock in old format, " + "not journaling\n", + VFSTOUFS(mp)->um_fs->fs_fsmnt); + mp->mnt_flag &= ~MNT_LOG; + return EINVAL; + } + + error = wapbl_log_position(mp, fs, devvp, &off, + &count, &blksize, &extradata); + if (error) + return error; + + error = wapbl_start(&mp->mnt_wapbl, mp, devvp, off, + count, blksize, mp->mnt_wapbl_replay, + ffs_wapbl_sync_metadata, + ffs_wapbl_abort_sync_metadata); + if (error) + return error; + + mp->mnt_wapbl_op = &wapbl_ops; + +#ifdef WAPBL_DEBUG + printf("%s: enabling logging\n", fs->fs_fsmnt); +#endif + + if ((fs->fs_flags & FS_DOWAPBL) == 0) { + UFS_WAPBL_BEGIN(mp); + fs->fs_flags |= FS_DOWAPBL; + error = ffs_sbupdate(ump, MNT_WAIT); + if (error) { + UFS_WAPBL_END(mp); + ffs_wapbl_stop(mp, MNT_FORCE); + return error; + } + UFS_WAPBL_END(mp); + error = wapbl_flush(mp->mnt_wapbl, 1); + if (error) { + ffs_wapbl_stop(mp, MNT_FORCE); + return error; + } + } + } else if (fs->fs_flags & FS_DOWAPBL) { + fs->fs_fmod = 1; + fs->fs_flags &= ~FS_DOWAPBL; + } + } + + /* + * It is recommended that you finish replay with logging enabled. + * However, even if logging is not enabled, the remaining log + * replay should be safely recoverable with an fsck, so perform + * it anyway. + */ + if ((fs->fs_ronly == 0) && mp->mnt_wapbl_replay) { + int saveflag = mp->mnt_flag & MNT_RDONLY; + /* + * Make sure MNT_RDONLY is not set so that the inode + * cleanup in ufs_inactive will actually do its work. + */ + mp->mnt_flag &= ~MNT_RDONLY; + ffs_wapbl_replay_finish(mp); + mp->mnt_flag |= saveflag; + KASSERT(fs->fs_ronly == 0); + } + + return 0; +} + +int +ffs_wapbl_stop(struct mount *mp, int force) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + int error; + + if (mp->mnt_wapbl) { + KDASSERT(fs->fs_ronly == 0); + + /* + * Make sure turning off FS_DOWAPBL is only removed + * as the only change in the final flush since otherwise + * a transaction may reorder writes. + */ + error = wapbl_flush(mp->mnt_wapbl, 1); + if (error && !force) + return error; + if (error && force) + goto forceout; + error = UFS_WAPBL_BEGIN(mp); + if (error && !force) + return error; + if (error && force) + goto forceout; + KASSERT(fs->fs_flags & FS_DOWAPBL); + + fs->fs_flags &= ~FS_DOWAPBL; + error = ffs_sbupdate(ump, MNT_WAIT); + KASSERT(error == 0); /* XXX a bit drastic! 
*/ + UFS_WAPBL_END(mp); + forceout: + error = wapbl_stop(mp->mnt_wapbl, force); + if (error) { + KASSERT(!force); + fs->fs_flags |= FS_DOWAPBL; + return error; + } + fs->fs_flags &= ~FS_DOWAPBL; /* Repeat in case of forced error */ + mp->mnt_wapbl = NULL; + +#ifdef WAPBL_DEBUG + printf("%s: disabled logging\n", fs->fs_fsmnt); +#endif + } + + return 0; +} + +int +ffs_wapbl_replay_start(struct mount *mp, struct fs *fs, struct vnode *devvp) +{ + int error; + daddr_t off; + size_t count; + size_t blksize; + uint64_t extradata; + + /* + * WAPBL needs UFS2 format super block, if we got here with a + * UFS1 format super block something is amiss... + */ + if (ffs_superblock_layout(fs) < 2) + return EINVAL; + + error = wapbl_log_position(mp, fs, devvp, &off, &count, &blksize, + &extradata); + + if (error) + return error; + + error = wapbl_replay_start(&mp->mnt_wapbl_replay, devvp, off, + count, blksize); + if (error) + return error; + + mp->mnt_wapbl_op = &wapbl_ops; + + return 0; +} + +/* + * If the superblock doesn't already have a recorded journal location + * then we allocate the journal in one of two positions: + * + * - At the end of the partition after the filesystem if there's + * enough space. "Enough space" is defined as >= 1MB of journal + * per 1GB of filesystem or 64MB, whichever is smaller. + * + * - Inside the filesystem. We try to allocate a contiguous journal + * based on the total filesystem size - the target is 1MB of journal + * per 1GB of filesystem, up to a maximum journal size of 64MB. As + * a worst case allowing for fragmentation, we'll allocate a journal + * 1/4 of the desired size but never smaller than 1MB. + * + * XXX In the future if we allow for non-contiguous journal files we + * can tighten the above restrictions. + * + * XXX + * These seems like a lot of duplication both here and in some of + * the userland tools (fsck_ffs, dumpfs, tunefs) with similar + * "switch (fs_journal_location)" constructs. Can we centralise + * this sort of code somehow/somewhere? 
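 *
 * As a quick sketch of the sizing rule above, expressed with the
 * constants the code below uses (illustrative only; the UFS_WAPBL_*
 * values are assumed to encode the 1 MB-per-1 GB policy with its 1 MB
 * floor and 64 MB ceiling):
 *
 *	desired = lfragtosize(fs, fs->fs_size) / UFS_WAPBL_JOURNAL_SCALE;
 *	desired = max(desired, UFS_WAPBL_MIN_JOURNAL_SIZE);
 *	desired = min(desired, UFS_WAPBL_MAX_JOURNAL_SIZE);
 *
 * so a 16 GB filesystem asks for a 16 MB journal, while anything larger
 * than 64 GB is capped at 64 MB.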
+ */ +static int +wapbl_log_position(struct mount *mp, struct fs *fs, struct vnode *devvp, + daddr_t *startp, size_t *countp, size_t *blksizep, uint64_t *extradatap) +{ + struct ufsmount *ump = VFSTOUFS(mp); + daddr_t logstart, logend, desired_logsize; + uint64_t numsecs; + unsigned secsize; + int error, location; + + if (fs->fs_journal_version == UFS_WAPBL_VERSION) { + switch (fs->fs_journal_location) { + case UFS_WAPBL_JOURNALLOC_END_PARTITION: + DPRINTF("found existing end-of-partition log\n"); + *startp = fs->fs_journallocs[UFS_WAPBL_EPART_ADDR]; + *countp = fs->fs_journallocs[UFS_WAPBL_EPART_COUNT]; + *blksizep = fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]; + DPRINTF(" start = %" PRId64 ", size = %zu, " + "blksize = %zu\n", *startp, *countp, *blksizep); + return 0; + + case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM: + DPRINTF("found existing in-filesystem log\n"); + *startp = fs->fs_journallocs[UFS_WAPBL_INFS_ADDR]; + *countp = fs->fs_journallocs[UFS_WAPBL_INFS_COUNT]; + *blksizep = fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]; + DPRINTF(" start = %" PRId64 ", size = %zu, " + "blksize = %zu\n", *startp, *countp, *blksizep); + return 0; + + default: + printf("ffs_wapbl: unknown journal type %d\n", + fs->fs_journal_location); + return EINVAL; + } + } + + desired_logsize = + lfragtosize(fs, fs->fs_size) / UFS_WAPBL_JOURNAL_SCALE; + DPRINTF("desired log size = %" PRId64 " kB\n", desired_logsize / 1024); + desired_logsize = max(desired_logsize, UFS_WAPBL_MIN_JOURNAL_SIZE); + desired_logsize = min(desired_logsize, UFS_WAPBL_MAX_JOURNAL_SIZE); + DPRINTF("adjusted desired log size = %" PRId64 " kB\n", + desired_logsize / 1024); + + /* Is there space after after filesystem on partition for log? */ + logstart = fsbtodb(fs, fs->fs_size); + error = getdisksize(devvp, &numsecs, &secsize); + if (error) + return error; + KDASSERT(secsize != 0); + logend = btodb(numsecs * secsize); + + if (dbtob(logend - logstart) >= desired_logsize) { + DPRINTF("enough space, use end-of-partition log\n"); + + location = UFS_WAPBL_JOURNALLOC_END_PARTITION; + *blksizep = secsize; + + *startp = logstart; + *countp = (logend - logstart); + *extradatap = 0; + + /* convert to physical block numbers */ + *startp = dbtob(*startp) / secsize; + *countp = dbtob(*countp) / secsize; + + fs->fs_journallocs[UFS_WAPBL_EPART_ADDR] = *startp; + fs->fs_journallocs[UFS_WAPBL_EPART_COUNT] = *countp; + fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ] = *blksizep; + fs->fs_journallocs[UFS_WAPBL_EPART_UNUSED] = *extradatap; + } else { + DPRINTF("end-of-partition has only %" PRId64 " free\n", + logend - logstart); + + location = UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM; + *blksizep = secsize; + + error = wapbl_create_infs_log(mp, fs, devvp, + startp, countp, extradatap); + ffs_sync(mp, MNT_WAIT, FSCRED); + + /* convert to physical block numbers */ + *startp = dbtob(*startp) / secsize; + *countp = dbtob(*countp) / secsize; + + fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] = *startp; + fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] = *countp; + fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ] = *blksizep; + fs->fs_journallocs[UFS_WAPBL_INFS_INO] = *extradatap; + } + + if (error == 0) { + /* update superblock with log location */ + fs->fs_journal_version = UFS_WAPBL_VERSION; + fs->fs_journal_location = location; + fs->fs_journal_flags = 0; + + error = ffs_sbupdate(ump, MNT_WAIT); + } + + return error; +} + +/* + * Try to create a journal log inside the filesystem. 
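 *
 * In outline, the function below allocates a fresh inode, marks it as a
 * regular file with the SF_LOG flag and a link count of one, asks
 * wapbl_allocate_log_file() to reserve a contiguous run of blocks for it
 * (via wapbl_find_log_start() and GOP_ALLOC), and then discards the
 * vnode: only the on-disk placeholder inode matters, and its inode
 * number is what ends up in the superblock's journal locators.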
+ */ +static int +wapbl_create_infs_log(struct mount *mp, struct fs *fs, struct vnode *devvp, + daddr_t *startp, size_t *countp, uint64_t *extradatap) +{ + struct vnode *vp, *rvp; + struct inode *ip; + int error; + + if ((error = VFS_ROOT(mp, &rvp)) != 0) + return error; + + error = UFS_VALLOC(rvp, 0 | S_IFREG, NOCRED, &vp); + if (mp->mnt_flag & MNT_UPDATE) { + vput(rvp); + } else { + VOP_UNLOCK(rvp); + vgone(rvp); + } + if (error != 0) + return error; + + vp->v_type = VREG; + ip = VTOI(vp); + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + ip->i_mode = 0 | IFREG; + DIP_ASSIGN(ip, mode, ip->i_mode); + ip->i_flags = SF_LOG; + DIP_ASSIGN(ip, flags, ip->i_flags); + ip->i_nlink = 1; + DIP_ASSIGN(ip, nlink, 1); + ffs_update(vp, NULL, NULL, UPDATE_WAIT); + + if ((error = wapbl_allocate_log_file(mp, vp, + startp, countp, extradatap)) != 0) { + /* + * If we couldn't allocate the space for the log file, + * remove the inode by setting its link count back to + * zero and bail. + */ + ip->i_nlink = 0; + DIP_ASSIGN(ip, nlink, 0); + VOP_UNLOCK(vp); + vgone(vp); + + return error; + } + + /* + * Now that we have the place-holder inode for the journal, + * we don't need the vnode ever again. + */ + VOP_UNLOCK(vp); + vgone(vp); + + return 0; +} + +int +wapbl_allocate_log_file(struct mount *mp, struct vnode *vp, + daddr_t *startp, size_t *countp, uint64_t *extradatap) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + daddr_t addr, indir_addr; + off_t logsize; + size_t size; + int error; + + logsize = 0; + /* check if there's a suggested log size */ + if (fs->fs_journal_flags & UFS_WAPBL_FLAGS_CREATE_LOG && + fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) + logsize = fs->fs_journallocs[UFS_WAPBL_INFS_COUNT]; + + if (vp->v_size > 0) { + printf("%s: file size (%" PRId64 ") non zero\n", __func__, + vp->v_size); + return EEXIST; + } + wapbl_find_log_start(mp, vp, logsize, &addr, &indir_addr, &size); + if (addr == 0) { + printf("%s: log not allocated, largest extent is " + "%" PRId64 "MB\n", __func__, + lblktosize(fs, size) / (1024 * 1024)); + return ENOSPC; + } + + logsize = lblktosize(fs, size); /* final log size */ + + VTOI(vp)->i_ffs_first_data_blk = addr; + VTOI(vp)->i_ffs_first_indir_blk = indir_addr; + + error = GOP_ALLOC(vp, 0, logsize, B_CONTIG, FSCRED); + if (error) { + printf("%s: GOP_ALLOC error %d\n", __func__, error); + return error; + } + + *startp = fsbtodb(fs, addr); + *countp = btodb(logsize); + *extradatap = VTOI(vp)->i_number; + + return 0; +} + +/* + * Find a suitable location for the journal in the filesystem. + * + * Our strategy here is to look for a contiguous block of free space + * at least "logfile" MB in size (plus room for any indirect blocks). + * We start at the middle of the filesystem and check each cylinder + * group working outwards. If "logfile" MB is not available as a + * single contigous chunk, then return the address and size of the + * largest chunk found. + * + * XXX + * At what stage does the search fail? Is if the largest space we could + * find is less than a quarter the requested space reasonable? If the + * search fails entirely, return a block address if "0" it indicate this. 
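 *
 * The cylinder group loop below visits groups in the order
 *
 *	M, M - 1, M + 1, M - 2, M + 2, ...		(M = fs_ncg / 2)
 *
 * which is what its "s++, n = -n, cg += n * s" update works out to:
 * one step further to each side of the middle on every iteration, until
 * a large enough extent is found or both ends of the filesystem are
 * reached.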
+ */ +static void +wapbl_find_log_start(struct mount *mp, struct vnode *vp, off_t logsize, + daddr_t *addr, daddr_t *indir_addr, size_t *size) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + struct vnode *devvp = ump->um_devvp; + struct cg *cgp; + struct buf *bp; + uint8_t *blksfree; + daddr_t blkno, best_addr, start_addr; + daddr_t desired_blks, min_desired_blks; + daddr_t freeblks, best_blks; + int bpcg, cg, error, fixedsize, indir_blks, n, s; +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + + if (logsize == 0) { + fixedsize = 0; /* We can adjust the size if tight */ + logsize = lfragtosize(fs, fs->fs_dsize) / + UFS_WAPBL_JOURNAL_SCALE; + DPRINTF("suggested log size = %" PRId64 "\n", logsize); + logsize = max(logsize, UFS_WAPBL_MIN_JOURNAL_SIZE); + logsize = min(logsize, UFS_WAPBL_MAX_JOURNAL_SIZE); + DPRINTF("adjusted log size = %" PRId64 "\n", logsize); + } else { + fixedsize = 1; + DPRINTF("fixed log size = %" PRId64 "\n", logsize); + } + + desired_blks = logsize / fs->fs_bsize; + DPRINTF("desired blocks = %" PRId64 "\n", desired_blks); + + /* add in number of indirect blocks needed */ + indir_blks = 0; + if (desired_blks >= NDADDR) { + struct indir indirs[NIADDR + 2]; + int num; + + error = ufs_getlbns(vp, desired_blks, indirs, &num); + if (error) { + printf("%s: ufs_getlbns failed, error %d!\n", + __func__, error); + goto bad; + } + + switch (num) { + case 2: + indir_blks = 1; /* 1st level indirect */ + break; + case 3: + indir_blks = 1 + /* 1st level indirect */ + 1 + /* 2nd level indirect */ + indirs[1].in_off + 1; /* extra 1st level indirect */ + break; + default: + printf("%s: unexpected numlevels %d from ufs_getlbns\n", + __func__, num); + *size = 0; + goto bad; + } + desired_blks += indir_blks; + } + DPRINTF("desired blocks = %" PRId64 " (including indirect)\n", + desired_blks); + + /* + * If a specific size wasn't requested, allow for a smaller log + * if we're really tight for space... + */ + min_desired_blks = desired_blks; + if (!fixedsize) + min_desired_blks = desired_blks / 4; + + /* Look at number of blocks per CG. If it's too small, bail early. */ + bpcg = fragstoblks(fs, fs->fs_fpg); + if (min_desired_blks > bpcg) { + printf("ffs_wapbl: cylinder group size of %" PRId64 " MB " + " is not big enough for journal\n", + lblktosize(fs, bpcg) / (1024 * 1024)); + goto bad; + } + + /* + * Start with the middle cylinder group, and search outwards in + * both directions until we either find the requested log size + * or reach the start/end of the file system. If we reach the + * start/end without finding enough space for the full requested + * log size, use the largest extent found if it is large enough + * to satisfy the our minimum size. + * + * XXX + * Can we just use the cluster contigsum stuff (esp on UFS2) + * here to simplify this search code? + */ + best_addr = 0; + best_blks = 0; + for (cg = fs->fs_ncg / 2, s = 0, n = 1; + best_blks < desired_blks && cg >= 0 && cg < fs->fs_ncg; + s++, n = -n, cg += n * s) { + DPRINTF("check cg %d of %d\n", cg, fs->fs_ncg); + error = bread(devvp, fsbtodb(fs, cgtod(fs, cg)), + fs->fs_cgsize, FSCRED, 0, &bp); + cgp = (struct cg *)bp->b_data; + if (error || !cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) { + brelse(bp, 0); + continue; + } + + blksfree = cg_blksfree(cgp, needswap); + + for (blkno = 0; blkno < bpcg;) { + /* look for next free block */ + /* XXX use scanc() and fragtbl[] here? 
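 *
 * (For reference: the loop below advances blkno to the next free block
 * in this cylinder group, giving up once fewer than min_desired_blks
 * blocks remain; the loop after it counts the run of consecutive free
 * blocks starting there.  The longest run seen so far is kept in
 * best_addr/best_blks, and the scan stops early once a run of at least
 * desired_blks is found.)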
*/ + for (; blkno < bpcg - min_desired_blks; blkno++) + if (ffs_isblock(fs, blksfree, blkno)) + break; + + /* past end of search space in this CG? */ + if (blkno >= bpcg - min_desired_blks) + break; + + /* count how many free blocks in this extent */ + start_addr = blkno; + for (freeblks = 0; blkno < bpcg; blkno++, freeblks++) + if (!ffs_isblock(fs, blksfree, blkno)) + break; + + if (freeblks > best_blks) { + best_blks = freeblks; + best_addr = blkstofrags(fs, start_addr) + + cgbase(fs, cg); + + if (freeblks >= desired_blks) { + DPRINTF("found len %" PRId64 + " at offset %" PRId64 " in gc\n", + freeblks, start_addr); + break; + } + } + } + brelse(bp, 0); + } + DPRINTF("best found len = %" PRId64 ", wanted %" PRId64 + " at addr %" PRId64 "\n", best_blks, desired_blks, best_addr); + + if (best_blks < min_desired_blks) { + *addr = 0; + *indir_addr = 0; + } else { + /* put indirect blocks at start, and data blocks after */ + *addr = best_addr + blkstofrags(fs, indir_blks); + *indir_addr = best_addr; + } + *size = min(desired_blks, best_blks) - indir_blks; + return; + +bad: + *addr = 0; + *indir_addr = 0; + *size = 0; + return; +} diff --git a/include/ufs/ffs/fs.h b/sys/ufs/ffs/fs.h similarity index 100% rename from include/ufs/ffs/fs.h rename to sys/ufs/ffs/fs.h diff --git a/sys/ufs/files.ufs b/sys/ufs/files.ufs new file mode 100644 index 000000000..7bd59a003 --- /dev/null +++ b/sys/ufs/files.ufs @@ -0,0 +1,89 @@ +# $NetBSD: files.ufs,v 1.27 2011/11/24 15:51:31 ahoka Exp $ + +deffs FFS +deffs EXT2FS +deffs MFS +deffs LFS +deffs CHFS + +defflag opt_ffs.h FFS_EI FFS_NO_SNAPSHOT APPLE_UFS + UFS_DIRHASH + UFS_EXTATTR UFS_EXTATTR_AUTOSTART + +defflag opt_lfs.h LFS_KERNEL_RFW + +file ufs/ext2fs/ext2fs_alloc.c ext2fs +file ufs/ext2fs/ext2fs_balloc.c ext2fs +file ufs/ext2fs/ext2fs_bmap.c ext2fs +file ufs/ext2fs/ext2fs_bswap.c ext2fs +file ufs/ext2fs/ext2fs_inode.c ext2fs +file ufs/ext2fs/ext2fs_lookup.c ext2fs +file ufs/ext2fs/ext2fs_readwrite.c ext2fs +file ufs/ext2fs/ext2fs_subr.c ext2fs +file ufs/ext2fs/ext2fs_vfsops.c ext2fs +file ufs/ext2fs/ext2fs_vnops.c ext2fs + +file ufs/chfs/ebh.c chfs +file ufs/chfs/chfs_ihash.c chfs +file ufs/chfs/chfs_scan.c chfs +file ufs/chfs/chfs_write.c chfs +file ufs/chfs/chfs_vnode_cache.c chfs +file ufs/chfs/chfs_erase.c chfs +file ufs/chfs/chfs_build.c chfs +file ufs/chfs/chfs_wbuf.c chfs +file ufs/chfs/chfs_vnops.c chfs +file ufs/chfs/chfs_gc.c chfs +file ufs/chfs/chfs_nodeops.c chfs +file ufs/chfs/chfs_malloc.c chfs +file ufs/chfs/chfs_pool.c chfs +file ufs/chfs/debug.c chfs +file ufs/chfs/chfs_vnode.c chfs +file ufs/chfs/chfs_subr.c chfs +file ufs/chfs/chfs_vfsops.c chfs +file ufs/chfs/chfs_readinode.c chfs + +file ufs/ffs/ffs_alloc.c ffs | lfs | mfs | ext2fs | chfs +file ufs/ffs/ffs_balloc.c ffs | lfs | mfs | ext2fs | chfs +file ufs/ffs/ffs_bswap.c (ffs | mfs) & ffs_ei +file ufs/ffs/ffs_inode.c ffs | lfs | mfs | ext2fs | chfs +file ufs/ffs/ffs_snapshot.c ffs | lfs | mfs | ext2fs | chfs +file ufs/ffs/ffs_subr.c ffs | lfs | mfs | ext2fs | chfs +file ufs/ffs/ffs_tables.c ffs | lfs | mfs | ext2fs | chfs +file ufs/ffs/ffs_vfsops.c ffs | lfs | mfs | ext2fs | chfs +file ufs/ffs/ffs_vnops.c ffs | lfs | mfs | ext2fs | chfs +file ufs/ffs/ffs_wapbl.c ffs & wapbl +file ufs/ffs/ffs_appleufs.c ffs & apple_ufs +file ufs/ffs/ffs_quota2.c quota2 & (ffs | lfs | mfs | ext2fs | chfs) + +file ufs/lfs/lfs_alloc.c lfs +file ufs/lfs/lfs_balloc.c lfs +file ufs/lfs/lfs_bio.c lfs +file ufs/lfs/lfs_cksum.c lfs +file ufs/lfs/lfs_debug.c lfs +file ufs/lfs/lfs_inode.c lfs +file 
ufs/lfs/lfs_itimes.c lfs +file ufs/lfs/lfs_rfw.c lfs & lfs_kernel_rfw +file ufs/lfs/lfs_segment.c lfs +file ufs/lfs/lfs_subr.c lfs +file ufs/lfs/lfs_syscalls.c lfs +file ufs/lfs/lfs_vfsops.c lfs +file ufs/lfs/lfs_vnops.c lfs + +file ufs/mfs/mfs_vfsops.c mfs +file ufs/mfs/mfs_vnops.c mfs +file ufs/mfs/mfs_miniroot.c + +file ufs/ufs/ufs_bmap.c ffs | lfs | mfs | ext2fs | chfs +file ufs/ufs/ufs_dirhash.c (ffs | lfs | mfs | ext2fs | chfs) & ufs_dirhash +file ufs/ufs/ufs_extattr.c (ffs | mfs) & ufs_extattr +file ufs/ufs/ufs_ihash.c ffs | lfs | mfs | ext2fs +file ufs/ufs/ufs_inode.c ffs | lfs | mfs | ext2fs +file ufs/ufs/ufs_lookup.c ffs | lfs | mfs | ext2fs | chfs +file ufs/ufs/ufs_quota.c (quota | quota2) & (ffs | lfs | mfs | ext2fs | chfs) +file ufs/ufs/ufs_quota1.c quota & (ffs | lfs | mfs | ext2fs | chfs) +file ufs/ufs/ufs_quota2.c quota2 & (ffs | lfs | mfs | ext2fs | chfs) +file ufs/ufs/quota1_subr.c +file ufs/ufs/quota2_subr.c quota2 & (ffs | lfs | mfs | ext2fs | chfs) +file ufs/ufs/ufs_vfsops.c ffs | lfs | mfs | ext2fs | chfs +file ufs/ufs/ufs_vnops.c ffs | lfs | mfs | ext2fs | chfs +file ufs/ufs/ufs_wapbl.c ffs & wapbl diff --git a/sys/ufs/lfs/CHANGES b/sys/ufs/lfs/CHANGES new file mode 100644 index 000000000..dfad48551 --- /dev/null +++ b/sys/ufs/lfs/CHANGES @@ -0,0 +1,169 @@ +# $NetBSD: CHANGES,v 1.5 2005/12/11 12:25:26 christos Exp $ + +kernel: + +- Instead of blindly continuing when it encounters an Inode that is + locked by another process, lfs_markv will process the rest of the + inodes passed to it and then return EAGAIN. The cleaner will + recognize this and not mark the segment clean. When the cleaner runs + again, the segment containg the (formerly) locked inode will sort high + for cleaning, since it is now almost entirely empty. + +- A beginning has been made to test keeping atime information in the + Ifile, instead of on the inodes. This should make read-mostly + filesystems significantly faster, since the inodes will then remain + close to the data blocks on disk; but of course the ifile will be + somewhat larger. This code is not enabled, as it makes the format of + IFILEs change. + +- The superblock has been broken into two components: an on-disk + superblock using fixed-size types, exactly 512 bytes regardless of + architecture (or could be enlarged in multiples of the media block + size up to LFS_SBPAD); and an in-memory superblock containing the + information only useful to a running LFS, including segment pointers, + etc. The superblock checksumming code has been modified to make + future changes to the superblock format easier. + +- Because of the way that lfs_writeseg works, buffers are freed before + they are really written to disk: their contents are copied into large + buffers which are written async. Because the buffer cache does not + serve to throttle these writes, and malloced memory is used to hold them, + there is a danger of running out of kmem_map. To avoid this, a new + compile-time parameter, LFS_THROTTLE, is used as an upper bound for the + number of partial-segments allowed to be in progress writing at any + given time. + +- If the system crashes between the point that a checkpoint is scheduled + for writing and the time that the write completes, the filesystem + could be left in an inconsistent state (no valid checkpoints on + disk). To avoid this, we toggle between the first two superblocks + when checkpointing, and (if it is indicated that no roll-forward agent + exists) do not allow one checkpoint to occur before the last one has + completed. 
When the filesystem is mounted, it uses the *older* of the + first two superblocks. + +- DIROPs: + + The design of the LFS includes segregating vnodes used in directory + operations, so that they can be written at the same time during a + checkpoint, avoiding filesystem inconsistency after a crash. Code for + this was partially written for BSD4.4, but was not complete or enabled. + + In particular, vnodes marked VDIROP could be flushed by getnewvnode at + any time, negating the usefulness of marking a vnode VDIROP, since if + the filesystem then crashed it would be inconsistent. Now, when a + vnode is first marked VDIROP it is also referenced. To avoid running + out of vnodes, an attempt to mark more than LFS_MAXDIROP vnodes wth + VDIROP will sleep, and trigger a partial-segment write when no dirops + are active. + +- LFS maintains a linked list of free inode numbers in the Ifile; + accesses to this list are now protected by a simple lock. + +- lfs_vfree is not allowed to run while an inode has blocks scheduled + for writing, since that could trigger a miscounting in lfs_truncate. + +- lfs_balloc now correctly extends fragments, if a block is written + beyond the current end-of-file. + +- Blocks which have already been gathered into a partial-segment are not + allowed to be extended, since if they were, any blocks following them + would either be written in the wrong place, or overwrite other blocks. + +- The LFS buffer-header accounting, which triggers a partial-segment + write if too many buffer-headers are in use by the LFS subystem, has + been expanded to include *bytes* used in LFS buffers as well. + +- Reads of the Ifile, which almost always come from the cleaner, can no + longer trigger a partial-segment write, since this could cause a + deadlock. + +- Support has been added (but not tested, and currently disabled by + default) for true read-only filesystems. Currently, if a filesystem + is mounted read-only the cleaner can still operate on it, but this + obviously would not be true for read-only media. (I think the + original plan was for the roll-forward agent to operate using this + "feature"?) + +- If a fake buffer is created by lfs_markv and another process draws the + same block in and changes it, the fake buffer is now discarded and + replaced by the "real" buffer containing the new data. + +- An inode which has blocks gathered no longer has IN_MODIFIED set, but + still does in fact have dirty blocks attached. lfs_update will now + wait for such an inode's writes to complete before it runs, + suppressing a panic in vinvalbuf. + +- Many filesystem operations now update the Ifile's mtime, allowing the + cleaner to detect when the filesystem is idle, and clean more + vigorously during such times (cf. Blackwell et al., 1995). + +- When writing a partial-segment, make sure that the current segment is + still marked ACTIVE afterward (otherwise the cleaner might try to + clean it, since it might well be mostly empty). + +- Don't trust the cleaner so much. Sort the blocks during gathering, + even if they came from the cleaner; verify the location of on-disk + inodes, even if the cleaner says it knows where they came from. + +- The cleaning code (lfs_markv in particular) has been entirely + rewritten, and the partial-segment writing code changed to match. + Lfs_markv no longer uses its own implementation of lfs_segwrite, but + marks inodes with IN_CLEANING to differentiate them from the + non-cleaning inodes. 
This change fixes numerous problems with the old + cleaner, including a buffer overrun, and lost extensions in active + fragments. lfs_bmapv looks up and returns the addresses of inode + blocks, so the cleaner can do something intelligent with them. + + If IN_CLEANING is set on an inode during partial-segment write, only fake + buffers will be written, and IN_MODIFIED will not be cleared, saving + us from a panic in vinvalbuf. The addition of IN_CLEANING also allows + dirops to be active while cleaning is in progress; since otherwise + buffers engaged in active dirops might be written ahead of schedule, + and cause an inconsistent checkpoint to be written to disk. + + (XXX - even now, DIROP blocks can sometimes be written to disk, if we + are cleaning the same blocks as are active? Grr, I don't see a good + solution for this!) + +- Added sysctl entries for LFS. In particular, `writeindir' controls + whether indirect blocks are written during non-checkpoint writes. + (Since there is no roll-forward agent as yet, there is no penalty in + not writing indirect blocks.) + +- Wake up the cleaner at fs-unmount time, so it can die (if we unmount + and then remount, we could conceivably get more than one cleaner + operating at once). + +newfs_lfs: + +- The ifile inode is now created with the schg flag set, since nothing + ever modifies it. This could be a pain for the roll-forward agent, + but since that should really run *before* the filesystem is mounted, + I don't care. + +- For large disks, it may be necessary to write one or more indirect + blocks when the ifile inode is created. Newlfs has been changed to + write the first indirect block, if necessary. It should instead just + build a set of inodes and blocks, and then use the partial-segment + writing routine mentioned above to write an ifile of whatever size is + desired. + +lfs_cleanerd: + +- Now writes information to the syslog. + +- Can now deal properly with fragments. + +- Sometimes, the cleaner can die. (Why?) If this happens and we don't + notice, we're screwed, since the fs will overfill. So, the invoked + cleaner now spawns itself repeatedly, a la init(8), to ensure that a + cleaner is always present to clean the fs. + +- Added a flag to clean more actively, not on low load average but + filesystem inactivity; a la Blackwell et al., 1995. + +fsck_lfs: + +- Exists, although it currently cannot actually fix anything (it is a + diagnostic tool only at this point). diff --git a/sys/ufs/lfs/Makefile b/sys/ufs/lfs/Makefile new file mode 100644 index 000000000..bb61c7b44 --- /dev/null +++ b/sys/ufs/lfs/Makefile @@ -0,0 +1,7 @@ +# $NetBSD: Makefile,v 1.1 1998/06/12 23:23:12 cgd Exp $ + +INCSDIR= /usr/include/ufs/lfs + +INCS= lfs.h lfs_extern.h + +.include diff --git a/sys/ufs/lfs/README b/sys/ufs/lfs/README new file mode 100644 index 000000000..827edbf92 --- /dev/null +++ b/sys/ufs/lfs/README @@ -0,0 +1,137 @@ +# $NetBSD: README,v 1.3 1999/03/15 00:46:47 perseant Exp $ + +# @(#)README 8.1 (Berkeley) 6/11/93 + +The file system is reasonably stable...I think. + +For details on the implementation, performance and why garbage +collection always wins, see Dr. Margo Seltzer's thesis available for +anonymous ftp from toe.cs.berkeley.edu, in the directory +pub/personal/margo/thesis.ps.Z, or the January 1993 USENIX paper. + +---------- +The disk is laid out in segments. The first segment starts 8K into the +disk (the first 8K is used for boot information). 
Each segment is composed +of the following: + + An optional super block + One or more groups of: + segment summary + 0 or more data blocks + 0 or more inode blocks + +The segment summary and inode/data blocks start after the super block (if +present), and grow toward the end of the segment. + + _______________________________________________ + | | | | | + | summary | data/inode | summary | data/inode | + | block | blocks | block | blocks | ... + |_________|____________|_________|____________| + +The data/inode blocks following a summary block are described by the +summary block. In order to permit the segment to be written in any order +and in a forward direction only, a checksum is calculated across the +blocks described by the summary. Additionally, the summary is checksummed +and timestamped. Both of these are intended for recovery; the former is +to make it easy to determine that it *is* a summary block and the latter +is to make it easy to determine when recovery is finished for partially +written segments. These checksums are also used by the cleaner. + + Summary block (detail) + ________________ + | sum cksum | + | data cksum | + | next segment | + | timestamp | + | FINFO count | + | inode count | + | flags | + |______________| + | FINFO-1 | 0 or more file info structures, identifying the + | . | blocks in the segment. + | . | + | . | + | FINFO-N | + | inode-N | + | . | + | . | + | . | 0 or more inode daddr_t's, identifying the inode + | inode-1 | blocks in the segment. + |______________| + +Inode blocks are blocks of on-disk inodes in the same format as those in +the FFS. However, spare[0] contains the inode number of the inode so we +can find a particular inode on a page. They are packed page_size / +sizeof(inode) to a block. Data blocks are exactly as in the FFS. Both +inodes and data blocks move around the file system at will. + +The file system is described by a super-block which is replicated and +occurs as the first block of the first and other segments. (The maximum +number of super-blocks is MAXNUMSB). Each super-block maintains a list +of the disk addresses of all the super-blocks. The super-block maintains +a small amount of checkpoint information, essentially just enough to find +the inode for the IFILE (fs->lfs_idaddr). + +The IFILE is visible in the file system, as inode number IFILE_INUM. It +contains information shared between the kernel and various user processes. + + Ifile (detail) + ________________ + | cleaner info | Cleaner information per file system. (Page + | | granularity.) + |______________| + | segment | Space available and last modified times per + | usage table | segment. (Page granularity.) + |______________| + | IFILE-1 | Per inode status information: current version #, + | . | if currently allocated, last access time and + | . | current disk address of containing inode block. + | . | If current disk address is LFS_UNUSED_DADDR, the + | IFILE-N | inode is not in use, and it's on the free list. + |______________| + + +First Segment at Creation Time: +_____________________________________________________________ +| | | | | | | | +| 8K pad | Super | summary | inode | ifile | root | l + f | +| | block | | block | | dir | dir | +|________|_______|_________|_______|_______|_______|_______| + ^ + Segment starts here. + +Some differences from the Sprite LFS implementation. + +1. The LFS implementation placed the ifile metadata and the super block + at fixed locations. This implementation replicates the super block + and puts each at a fixed location. 
The checkpoint data is divided into + two parts -- just enough information to find the IFILE is stored in + two of the super blocks, although it is not toggled between them as in + the Sprite implementation. (This was deliberate, to avoid a single + point of failure.) The remaining checkpoint information is treated as + a regular file, which means that the cleaner info, the segment usage + table and the ifile meta-data are stored in normal log segments. + (Tastes great, less filling...) + +2. The segment layout is radically different in Sprite; this implementation + uses something a lot like network framing, where data/inode blocks are + written asynchronously, and a checksum is used to validate any set of + summary and data/inode blocks. Sprite writes summary blocks synchronously + after the data/inode blocks have been written and the existence of the + summary block validates the data/inode blocks. This permits us to write + everything contiguously, even partial segments and their summaries, whereas + Sprite is forced to seek (from the end of the data inode to the summary + which lives at the end of the segment). Additionally, writing the summary + synchronously should cost about 1/2 a rotation per summary. + +3. Sprite LFS distinguishes between different types of blocks in the segment. + Other than inode blocks and data blocks, we don't. + +4. Sprite LFS traverses the IFILE looking for free blocks. We maintain a + free list threaded through the IFILE entries. + +5. The cleaner runs in user space, as opposed to kernel space. It shares + information with the kernel by reading/writing the IFILE and through + cleaner specific system calls. + diff --git a/sys/ufs/lfs/TODO b/sys/ufs/lfs/TODO new file mode 100644 index 000000000..e86ecdb03 --- /dev/null +++ b/sys/ufs/lfs/TODO @@ -0,0 +1,109 @@ +# $NetBSD: TODO,v 1.10 2005/12/11 12:25:26 christos Exp $ + +- Lock audit. Need to check locking for multiprocessor case in particular. + +- Get rid of lfs_segclean(); the kernel should clean a dirty segment IFF it + has passed two checkpoints containing zero live bytes. + +- Now that our cache is basically all of physical memory, we need to make + sure that segwrite is not starving other important things. Need a way + to prioritize which blocks are most important to write, and write only + those, saving the rest for later. Does this change our notion of what + a checkpoint is? + +- Investigate alternate inode locking strategy: Inode locks are useful + for locking against simultaneous changes to inode size (balloc, + truncate, write) but because the assignment of disk blocks is also + covered by the segment lock, we don't really need to pay attention to + the inode lock when writing a segment, right? If this is true, the + locking problem in lfs_{bmapv,markv} goes away and lfs_reserve can go, + too. + +- Get rid of DEV_BSIZE, pay attention to the media block size at mount time. + +- More fs ops need to call lfs_imtime. Which ones? (Blackwell et al., 1995) + +- lfs_vunref_head exists so that vnodes loaded solely for cleaning can + be put back on the *head* of the vnode free list. Make sure we + actually do this, since we now take IN_CLEANING off during segment write. + +- The cleaner could be enhanced to be controlled from other processes, + and possibly perform additional tasks: + + - Backups. At a minimum, turn the cleaner off and on to allow + effective live backups. More aggressively, the cleaner itself could + be the backup agent, and dump_lfs would merely be a controller. 
+ + - Cleaning time policies. Be able to tweak the cleaner's thresholds + to allow more thorough cleaning during policy-determined idle + periods (regardless of actual idleness) or put off until later + during short, intensive write periods. + + - File coalescing and placement. During periods we expect to be idle, + coalesce fragmented files into one place on disk for better read + performance. Ideally, move files that have not been accessed in a + while to the extremes of the disk, thereby shortening seek times for + files that are accessed more frequently (though how the cleaner + should communicate "please put this near the beginning or end of the + disk" to the kernel is a very good question; flags to lfs_markv?). + + - Versioning. When it cleans a segment it could write data for files + that were less than n versions old to tape or elsewhere. Perhaps it + could even write them back onto the disk, although that requires + more thought (and kernel mods). + +- Move lfs_countlocked() into vfs_bio.c, to replace count_locked_queue; + perhaps keep the name, replace the function. Could it count referenced + vnodes as well, if it was in vfs_subr.c instead? + +- Why not delete the lfs_bmapv call, just mark everything dirty that + isn't deleted/truncated? Get some numbers about what percentage of + the stuff that the cleaner thinks might be live is live. If it's + high, get rid of lfs_bmapv. + +- There is a nasty problem in that it may take *more* room to write the + data to clean a segment than is returned by the new segment because of + indirect blocks in segment 2 being dirtied by the data being copied + into the log from segment 1. The suggested solution at this point is + to detect it when we have no space left on the filesystem, write the + extra data into the last segment (leaving no clean ones), make it a + checkpoint and shut down the file system for fixing by a utility + reading the raw partition. Argument is that this should never happen + and is practically impossible to fix since the cleaner would have to + theoretically build a model of the entire filesystem in memory to + detect the condition occurring. A file coalescing cleaner will help + avoid the problem, and one that reads/writes from the raw disk could + fix it. + +- Need to keep vnode v_numoutput up to date for pending writes? + +- If delete a file that's being executed, the version number isn't + updated, and fsck_lfs has to figure this out; case is the same as if + have an inode that no directory references, so the file should be + reattached into lost+found. + +- Currently there's no notion of write error checking. + + Failed data/inode writes should be rescheduled (kernel level bad blocking). + + Failed superblock writes should cause selection of new superblock + for checkpointing. + +- Future fantasies: + - unrm, versioning + - transactions + - extended cleaner policies (hot/cold data, data placement) + +- Problem with the concept of multiple buffer headers referencing the segment: + Positives: + Don't lock down 1 segment per file system of physical memory. + Don't copy from buffers to segment memory. + Don't tie down the bus to transfer 1M. + Works on controllers supporting less than large transfers. + Disk can start writing immediately instead of waiting 1/2 rotation + and the full transfer. + Negatives: + Have to do segment write then segment summary write, since the latter + is what verifies that the segment is okay. (Is there another way + to do this?) 
+ +- The algorithm for selecting the disk addresses of the super-blocks + has to be available to the user program which checks the file system. diff --git a/include/ufs/lfs/lfs.h b/sys/ufs/lfs/lfs.h similarity index 100% rename from include/ufs/lfs/lfs.h rename to sys/ufs/lfs/lfs.h diff --git a/sys/ufs/lfs/lfs_alloc.c b/sys/ufs/lfs/lfs_alloc.c new file mode 100644 index 000000000..8d2baa01d --- /dev/null +++ b/sys/ufs/lfs/lfs_alloc.c @@ -0,0 +1,674 @@ +/* $NetBSD: lfs_alloc.c,v 1.111 2011/06/12 03:36:01 rmind Exp $ */ + +/*- + * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Konrad E. Schroder . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_alloc.c 8.4 (Berkeley) 1/4/94 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: lfs_alloc.c,v 1.111 2011/06/12 03:36:01 rmind Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_quota.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +/* Constants for inode free bitmap */ +#define BMSHIFT 5 /* 2 ** 5 = 32 */ +#define BMMASK ((1 << BMSHIFT) - 1) +#define SET_BITMAP_FREE(F, I) do { \ + DLOG((DLOG_ALLOC, "lfs: ino %d wrd %d bit %d set\n", (int)(I), \ + (int)((I) >> BMSHIFT), (int)((I) & BMMASK))); \ + (F)->lfs_ino_bitmap[(I) >> BMSHIFT] |= (1 << ((I) & BMMASK)); \ +} while (0) +#define CLR_BITMAP_FREE(F, I) do { \ + DLOG((DLOG_ALLOC, "lfs: ino %d wrd %d bit %d clr\n", (int)(I), \ + (int)((I) >> BMSHIFT), (int)((I) & BMMASK))); \ + (F)->lfs_ino_bitmap[(I) >> BMSHIFT] &= ~(1 << ((I) & BMMASK)); \ +} while(0) + +#define ISSET_BITMAP_FREE(F, I) \ + ((F)->lfs_ino_bitmap[(I) >> BMSHIFT] & (1 << ((I) & BMMASK))) + +/* + * Add a new block to the Ifile, to accommodate future file creations. + * Called with the segment lock held. + */ +int +lfs_extend_ifile(struct lfs *fs, kauth_cred_t cred) +{ + struct vnode *vp; + struct inode *ip; + IFILE *ifp; + IFILE_V1 *ifp_v1; + struct buf *bp, *cbp; + int error; + daddr_t i, blkno, xmax; + ino_t oldlast, maxino; + CLEANERINFO *cip; + + ASSERT_SEGLOCK(fs); + + vp = fs->lfs_ivnode; + ip = VTOI(vp); + blkno = lblkno(fs, ip->i_size); + if ((error = lfs_balloc(vp, ip->i_size, fs->lfs_bsize, cred, 0, + &bp)) != 0) { + return (error); + } + ip->i_size += fs->lfs_bsize; + ip->i_ffs1_size = ip->i_size; + uvm_vnp_setsize(vp, ip->i_size); + + maxino = ((ip->i_size >> fs->lfs_bshift) - fs->lfs_cleansz - + fs->lfs_segtabsz) * fs->lfs_ifpb; + fs->lfs_ino_bitmap = (lfs_bm_t *) + realloc(fs->lfs_ino_bitmap, ((maxino + BMMASK) >> BMSHIFT) * + sizeof(lfs_bm_t), M_SEGMENT, M_WAITOK); + KASSERT(fs->lfs_ino_bitmap != NULL); + + i = (blkno - fs->lfs_segtabsz - fs->lfs_cleansz) * + fs->lfs_ifpb; + + /* + * We insert the new inodes at the head of the free list. + * Under normal circumstances, the free list is empty here, + * so we are also incidentally placing them at the end (which + * we must do if we are to keep them in order). 
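 *
 * A small worked example of the chaining done below (illustrative;
 * assume fs->lfs_ifpb, the number of ifile entries per block, is 64):
 * if the new block covers inodes 128..191 and the free list was empty
 * (oldlast == LFS_UNUSED_INUM), the loop links
 *
 *	head -> 128 -> 129 -> ... -> 191 -> LFS_UNUSED_INUM
 *
 * and the tail pointer is set to 191 (xmax - 1).  If the list was not
 * empty, 191 points at the old head instead, which is why order is only
 * preserved when the list starts out empty.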
+ */ + LFS_GET_HEADFREE(fs, cip, cbp, &oldlast); + LFS_PUT_HEADFREE(fs, cip, cbp, i); +#ifdef DIAGNOSTIC + if (fs->lfs_freehd == LFS_UNUSED_INUM) + panic("inode 0 allocated [2]"); +#endif /* DIAGNOSTIC */ + xmax = i + fs->lfs_ifpb; + + if (fs->lfs_version == 1) { + for (ifp_v1 = (IFILE_V1 *)bp->b_data; i < xmax; ++ifp_v1) { + SET_BITMAP_FREE(fs, i); + ifp_v1->if_version = 1; + ifp_v1->if_daddr = LFS_UNUSED_DADDR; + ifp_v1->if_nextfree = ++i; + } + ifp_v1--; + ifp_v1->if_nextfree = oldlast; + } else { + for (ifp = (IFILE *)bp->b_data; i < xmax; ++ifp) { + SET_BITMAP_FREE(fs, i); + ifp->if_version = 1; + ifp->if_daddr = LFS_UNUSED_DADDR; + ifp->if_nextfree = ++i; + } + ifp--; + ifp->if_nextfree = oldlast; + } + LFS_PUT_TAILFREE(fs, cip, cbp, xmax - 1); + + (void) LFS_BWRITE_LOG(bp); /* Ifile */ + + return 0; +} + +/* Allocate a new inode. */ +/* ARGSUSED */ +/* VOP_BWRITE 2i times */ +int +lfs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred, + struct vnode **vpp) +{ + struct lfs *fs; + struct buf *bp, *cbp; + struct ifile *ifp; + ino_t new_ino; + int error; + int new_gen; + CLEANERINFO *cip; + + fs = VTOI(pvp)->i_lfs; + if (fs->lfs_ronly) + return EROFS; + + ASSERT_NO_SEGLOCK(fs); + + lfs_seglock(fs, SEGM_PROT); + vn_lock(fs->lfs_ivnode, LK_EXCLUSIVE); + + /* Get the head of the freelist. */ + LFS_GET_HEADFREE(fs, cip, cbp, &new_ino); + KASSERT(new_ino != LFS_UNUSED_INUM && new_ino != LFS_IFILE_INUM); + + DLOG((DLOG_ALLOC, "lfs_valloc: allocate inode %lld\n", + (long long)new_ino)); + + /* + * Remove the inode from the free list and write the new start + * of the free list into the superblock. + */ + CLR_BITMAP_FREE(fs, new_ino); + LFS_IENTRY(ifp, fs, new_ino, bp); + if (ifp->if_daddr != LFS_UNUSED_DADDR) + panic("lfs_valloc: inuse inode %llu on the free list", + (unsigned long long)new_ino); + LFS_PUT_HEADFREE(fs, cip, cbp, ifp->if_nextfree); + DLOG((DLOG_ALLOC, "lfs_valloc: headfree %lld -> %lld\n", + (long long)new_ino, (long long)ifp->if_nextfree)); + + new_gen = ifp->if_version; /* version was updated by vfree */ + brelse(bp, 0); + + /* Extend IFILE so that the next lfs_valloc will succeed. */ + if (fs->lfs_freehd == LFS_UNUSED_INUM) { + if ((error = lfs_extend_ifile(fs, cred)) != 0) { + LFS_PUT_HEADFREE(fs, cip, cbp, new_ino); + VOP_UNLOCK(fs->lfs_ivnode); + lfs_segunlock(fs); + return error; + } + } +#ifdef DIAGNOSTIC + if (fs->lfs_freehd == LFS_UNUSED_INUM) + panic("inode 0 allocated [3]"); +#endif /* DIAGNOSTIC */ + + /* Set superblock modified bit and increment file count. */ + mutex_enter(&lfs_lock); + fs->lfs_fmod = 1; + mutex_exit(&lfs_lock); + ++fs->lfs_nfiles; + + VOP_UNLOCK(fs->lfs_ivnode); + lfs_segunlock(fs); + + return lfs_ialloc(fs, pvp, new_ino, new_gen, vpp); +} + +/* + * Finish allocating a new inode, given an inode and generation number. + */ +int +lfs_ialloc(struct lfs *fs, struct vnode *pvp, ino_t new_ino, int new_gen, + struct vnode **vpp) +{ + struct inode *ip; + struct vnode *vp; + + ASSERT_NO_SEGLOCK(fs); + + vp = *vpp; + mutex_enter(&ufs_hashlock); + /* Create an inode to associate with the vnode. */ + lfs_vcreate(pvp->v_mount, new_ino, vp); + + ip = VTOI(vp); + mutex_enter(&lfs_lock); + LFS_SET_UINO(ip, IN_CHANGE); + mutex_exit(&lfs_lock); + /* on-disk structure has been zeroed out by lfs_vcreate */ + ip->i_din.ffs1_din->di_inumber = new_ino; + + /* Note no blocks yet */ + ip->i_lfs_hiblk = -1; + + /* Set a new generation number for this inode. 
*/ + if (new_gen) { + ip->i_gen = new_gen; + ip->i_ffs1_gen = new_gen; + } + + /* Insert into the inode hash table. */ + ufs_ihashins(ip); + mutex_exit(&ufs_hashlock); + + ufs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p, vpp); + vp = *vpp; + ip = VTOI(vp); + + memset(ip->i_lfs_fragsize, 0, NDADDR * sizeof(*ip->i_lfs_fragsize)); + + uvm_vnp_setsize(vp, 0); + lfs_mark_vnode(vp); + genfs_node_init(vp, &lfs_genfsops); + vref(ip->i_devvp); + return (0); +} + +/* Create a new vnode/inode pair and initialize what fields we can. */ +void +lfs_vcreate(struct mount *mp, ino_t ino, struct vnode *vp) +{ + struct inode *ip; + struct ufs1_dinode *dp; + struct ufsmount *ump; + + /* Get a pointer to the private mount structure. */ + ump = VFSTOUFS(mp); + + ASSERT_NO_SEGLOCK(ump->um_lfs); + + /* Initialize the inode. */ + ip = pool_get(&lfs_inode_pool, PR_WAITOK); + memset(ip, 0, sizeof(*ip)); + dp = pool_get(&lfs_dinode_pool, PR_WAITOK); + memset(dp, 0, sizeof(*dp)); + ip->inode_ext.lfs = pool_get(&lfs_inoext_pool, PR_WAITOK); + memset(ip->inode_ext.lfs, 0, sizeof(*ip->inode_ext.lfs)); + vp->v_data = ip; + ip->i_din.ffs1_din = dp; + ip->i_ump = ump; + ip->i_vnode = vp; + ip->i_devvp = ump->um_devvp; + ip->i_dev = ump->um_dev; + ip->i_number = dp->di_inumber = ino; + ip->i_lfs = ump->um_lfs; + ip->i_lfs_effnblks = 0; + SPLAY_INIT(&ip->i_lfs_lbtree); + ip->i_lfs_nbtree = 0; + LIST_INIT(&ip->i_lfs_segdhd); +#ifdef QUOTA + ufsquota_init(ip); +#endif +} + +#if 0 +/* + * Find the highest-numbered allocated inode. + * This will be used to shrink the Ifile. + */ +static inline ino_t +lfs_last_alloc_ino(struct lfs *fs) +{ + ino_t ino, maxino; + + maxino = ((fs->lfs_ivnode->v_size >> fs->lfs_bshift) - + fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb; + for (ino = maxino - 1; ino > LFS_UNUSED_INUM; --ino) { + if (ISSET_BITMAP_FREE(fs, ino) == 0) + break; + } + return ino; +} +#endif + +/* + * Find the previous (next lowest numbered) free inode, if any. + * If there is none, return LFS_UNUSED_INUM. + */ +static inline ino_t +lfs_freelist_prev(struct lfs *fs, ino_t ino) +{ + ino_t tino, bound, bb, freehdbb; + + if (fs->lfs_freehd == LFS_UNUSED_INUM) /* No free inodes at all */ + return LFS_UNUSED_INUM; + + /* Search our own word first */ + bound = ino & ~BMMASK; + for (tino = ino - 1; tino >= bound && tino > LFS_UNUSED_INUM; tino--) + if (ISSET_BITMAP_FREE(fs, tino)) + return tino; + /* If there are no lower words to search, just return */ + if (ino >> BMSHIFT == 0) + return LFS_UNUSED_INUM; + + /* + * Find a word with a free inode in it. We have to be a bit + * careful here since ino_t is unsigned. + */ + freehdbb = (fs->lfs_freehd >> BMSHIFT); + for (bb = (ino >> BMSHIFT) - 1; bb >= freehdbb && bb > 0; --bb) + if (fs->lfs_ino_bitmap[bb]) + break; + if (fs->lfs_ino_bitmap[bb] == 0) + return LFS_UNUSED_INUM; + + /* Search the word we found */ + for (tino = (bb << BMSHIFT) | BMMASK; tino >= (bb << BMSHIFT) && + tino > LFS_UNUSED_INUM; tino--) + if (ISSET_BITMAP_FREE(fs, tino)) + break; + + if (tino <= LFS_IFILE_INUM) + tino = LFS_UNUSED_INUM; + + return tino; +} + +/* Free an inode. */ +/* ARGUSED */ +/* VOP_BWRITE 2i times */ +int +lfs_vfree(struct vnode *vp, ino_t ino, int mode) +{ + SEGUSE *sup; + CLEANERINFO *cip; + struct buf *cbp, *bp; + struct ifile *ifp; + struct inode *ip; + struct lfs *fs; + daddr_t old_iaddr; + ino_t otail; + + /* Get the inode number and file system. 
*/ + ip = VTOI(vp); + fs = ip->i_lfs; + ino = ip->i_number; + + ASSERT_NO_SEGLOCK(fs); + DLOG((DLOG_ALLOC, "lfs_vfree: free ino %lld\n", (long long)ino)); + + /* Drain of pending writes */ + mutex_enter(vp->v_interlock); + while (fs->lfs_version > 1 && WRITEINPROG(vp)) { + cv_wait(&vp->v_cv, vp->v_interlock); + } + mutex_exit(vp->v_interlock); + + lfs_seglock(fs, SEGM_PROT); + vn_lock(fs->lfs_ivnode, LK_EXCLUSIVE); + + lfs_unmark_vnode(vp); + mutex_enter(&lfs_lock); + if (vp->v_uflag & VU_DIROP) { + vp->v_uflag &= ~VU_DIROP; + --lfs_dirvcount; + --fs->lfs_dirvcount; + TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain); + wakeup(&fs->lfs_dirvcount); + wakeup(&lfs_dirvcount); + mutex_exit(&lfs_lock); + lfs_vunref(vp); + + /* + * If this inode is not going to be written any more, any + * segment accounting left over from its truncation needs + * to occur at the end of the next dirops flush. Attach + * them to the fs-wide list for that purpose. + */ + if (LIST_FIRST(&ip->i_lfs_segdhd) != NULL) { + struct segdelta *sd; + + while((sd = LIST_FIRST(&ip->i_lfs_segdhd)) != NULL) { + LIST_REMOVE(sd, list); + LIST_INSERT_HEAD(&fs->lfs_segdhd, sd, list); + } + } + } else { + /* + * If it's not a dirop, we can finalize right away. + */ + mutex_exit(&lfs_lock); + lfs_finalize_ino_seguse(fs, ip); + } + + mutex_enter(&lfs_lock); + LFS_CLR_UINO(ip, IN_ACCESSED|IN_CLEANING|IN_MODIFIED); + mutex_exit(&lfs_lock); + ip->i_flag &= ~IN_ALLMOD; + ip->i_lfs_iflags |= LFSI_DELETED; + + /* + * Set the ifile's inode entry to unused, increment its version number + * and link it onto the free chain. + */ + SET_BITMAP_FREE(fs, ino); + LFS_IENTRY(ifp, fs, ino, bp); + old_iaddr = ifp->if_daddr; + ifp->if_daddr = LFS_UNUSED_DADDR; + ++ifp->if_version; + if (fs->lfs_version == 1) { + LFS_GET_HEADFREE(fs, cip, cbp, &(ifp->if_nextfree)); + LFS_PUT_HEADFREE(fs, cip, cbp, ino); + (void) LFS_BWRITE_LOG(bp); /* Ifile */ + } else { + ino_t tino, onf; + + ifp->if_nextfree = LFS_UNUSED_INUM; + (void) LFS_BWRITE_LOG(bp); /* Ifile */ + + tino = lfs_freelist_prev(fs, ino); + if (tino == LFS_UNUSED_INUM) { + /* Nothing free below us, put us on the head */ + LFS_IENTRY(ifp, fs, ino, bp); + LFS_GET_HEADFREE(fs, cip, cbp, &(ifp->if_nextfree)); + LFS_PUT_HEADFREE(fs, cip, cbp, ino); + DLOG((DLOG_ALLOC, "lfs_vfree: headfree %lld -> %lld\n", + (long long)ifp->if_nextfree, (long long)ino)); + LFS_BWRITE_LOG(bp); /* Ifile */ + + /* If the list was empty, set tail too */ + LFS_GET_TAILFREE(fs, cip, cbp, &otail); + if (otail == LFS_UNUSED_INUM) { + LFS_PUT_TAILFREE(fs, cip, cbp, ino); + DLOG((DLOG_ALLOC, "lfs_vfree: tailfree %lld " + "-> %lld\n", (long long)otail, + (long long)ino)); + } + } else { + /* + * Insert this inode into the list after tino. + * We hold the segment lock so we don't have to + * worry about blocks being written out of order. 
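 *
 * Worked example (illustrative): if inode 10 is being freed and the
 * free list currently reads 5 -> 12 -> 20, lfs_freelist_prev() returns
 * 5, so the code links 5 -> 10 -> 12 and the tail stays at 20.  Only
 * when no lower-numbered free inode exists does the branch above put
 * the freed inode at the head of the list.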
+ */ + DLOG((DLOG_ALLOC, "lfs_vfree: insert ino %lld " + " after %lld\n", ino, tino)); + + LFS_IENTRY(ifp, fs, tino, bp); + onf = ifp->if_nextfree; + ifp->if_nextfree = ino; + LFS_BWRITE_LOG(bp); /* Ifile */ + + LFS_IENTRY(ifp, fs, ino, bp); + ifp->if_nextfree = onf; + LFS_BWRITE_LOG(bp); /* Ifile */ + + /* If we're last, put us on the tail */ + if (onf == LFS_UNUSED_INUM) { + LFS_GET_TAILFREE(fs, cip, cbp, &otail); + LFS_PUT_TAILFREE(fs, cip, cbp, ino); + DLOG((DLOG_ALLOC, "lfs_vfree: tailfree %lld " + "-> %lld\n", (long long)otail, + (long long)ino)); + } + } + } +#ifdef DIAGNOSTIC + if (ino == LFS_UNUSED_INUM) { + panic("inode 0 freed"); + } +#endif /* DIAGNOSTIC */ + if (old_iaddr != LFS_UNUSED_DADDR) { + LFS_SEGENTRY(sup, fs, dtosn(fs, old_iaddr), bp); +#ifdef DIAGNOSTIC + if (sup->su_nbytes < sizeof (struct ufs1_dinode)) { + printf("lfs_vfree: negative byte count" + " (segment %" PRIu32 " short by %d)\n", + dtosn(fs, old_iaddr), + (int)sizeof (struct ufs1_dinode) - + sup->su_nbytes); + panic("lfs_vfree: negative byte count"); + sup->su_nbytes = sizeof (struct ufs1_dinode); + } +#endif + sup->su_nbytes -= sizeof (struct ufs1_dinode); + LFS_WRITESEGENTRY(sup, fs, dtosn(fs, old_iaddr), bp); /* Ifile */ + } + + /* Set superblock modified bit and decrement file count. */ + mutex_enter(&lfs_lock); + fs->lfs_fmod = 1; + mutex_exit(&lfs_lock); + --fs->lfs_nfiles; + + VOP_UNLOCK(fs->lfs_ivnode); + lfs_segunlock(fs); + + return (0); +} + +/* + * Sort the freelist and set up the free-inode bitmap. + * To be called by lfs_mountfs(). + */ +void +lfs_order_freelist(struct lfs *fs) +{ + CLEANERINFO *cip; + IFILE *ifp = NULL; + struct buf *bp; + ino_t ino, firstino, lastino, maxino; +#ifdef notyet + struct vnode *vp; +#endif + + ASSERT_NO_SEGLOCK(fs); + lfs_seglock(fs, SEGM_PROT); + + maxino = ((fs->lfs_ivnode->v_size >> fs->lfs_bshift) - + fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb; + fs->lfs_ino_bitmap = (lfs_bm_t *) + malloc(((maxino + BMMASK) >> BMSHIFT) * sizeof(lfs_bm_t), + M_SEGMENT, M_WAITOK | M_ZERO); + KASSERT(fs->lfs_ino_bitmap != NULL); + + firstino = lastino = LFS_UNUSED_INUM; + for (ino = 0; ino < maxino; ino++) { + if (ino % fs->lfs_ifpb == 0) + LFS_IENTRY(ifp, fs, ino, bp); + else + ++ifp; + + /* Don't put zero or ifile on the free list */ + if (ino == LFS_UNUSED_INUM || ino == LFS_IFILE_INUM) + continue; + +#ifdef notyet + /* Address orphaned files */ + if (ifp->if_nextfree == LFS_ORPHAN_NEXTFREE && + VFS_VGET(fs->lfs_ivnode->v_mount, ino, &vp) == 0) { + lfs_truncate(vp, 0, 0, NOCRED); + vput(vp); + LFS_SEGENTRY(sup, fs, dtosn(fs, ifp->if_daddr), bp); + KASSERT(sup->su_nbytes >= DINODE1_SIZE); + sup->su_nbytes -= DINODE1_SIZE; + LFS_WRITESEGENTRY(sup, fs, dtosn(fs, ifp->if_daddr), bp); + + /* Set up to fall through to next section */ + ifp->if_daddr = LFS_UNUSED_DADDR; + LFS_BWRITE_LOG(bp); + LFS_IENTRY(ifp, fs, ino, bp); + } +#endif + + if (ifp->if_daddr == LFS_UNUSED_DADDR) { + if (firstino == LFS_UNUSED_INUM) + firstino = ino; + else { + brelse(bp, 0); + + LFS_IENTRY(ifp, fs, lastino, bp); + ifp->if_nextfree = ino; + LFS_BWRITE_LOG(bp); + + LFS_IENTRY(ifp, fs, ino, bp); + } + lastino = ino; + + SET_BITMAP_FREE(fs, ino); + } + + if ((ino + 1) % fs->lfs_ifpb == 0) + brelse(bp, 0); + } + + LFS_PUT_HEADFREE(fs, cip, bp, firstino); + LFS_PUT_TAILFREE(fs, cip, bp, lastino); + + lfs_segunlock(fs); +} + +void +lfs_orphan(struct lfs *fs, ino_t ino) +{ + IFILE *ifp; + struct buf *bp; + + LFS_IENTRY(ifp, fs, ino, bp); + ifp->if_nextfree = LFS_ORPHAN_NEXTFREE; + LFS_BWRITE_LOG(bp); 
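+	/*
+	 * The ifile entry is left holding LFS_ORPHAN_NEXTFREE rather than
+	 * a valid free-list link.  lfs_order_freelist(), run from
+	 * lfs_mountfs(), is intended to recognize such entries and
+	 * truncate the orphaned inode (that recovery path is currently
+	 * under "#ifdef notyet" above), after which the slot falls onto
+	 * the free list like any other unused inode.
+	 */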
+} diff --git a/sys/ufs/lfs/lfs_balloc.c b/sys/ufs/lfs/lfs_balloc.c new file mode 100644 index 000000000..d46ba0570 --- /dev/null +++ b/sys/ufs/lfs/lfs_balloc.c @@ -0,0 +1,582 @@ +/* $NetBSD: lfs_balloc.c,v 1.70 2011/07/11 08:27:40 hannken Exp $ */ + +/*- + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Konrad E. Schroder . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/* + * Copyright (c) 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)lfs_balloc.c 8.4 (Berkeley) 5/8/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: lfs_balloc.c,v 1.70 2011/07/11 08:27:40 hannken Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_quota.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include +#include + +#include + +int lfs_fragextend(struct vnode *, int, int, daddr_t, struct buf **, kauth_cred_t); + +u_int64_t locked_fakequeue_count; + +/* + * Allocate a block, and to inode and filesystem block accounting for it + * and for any indirect blocks the may need to be created in order for + * this block to be created. + * + * Blocks which have never been accounted for (i.e., which "do not exist") + * have disk address 0, which is translated by ufs_bmap to the special value + * UNASSIGNED == -1, as in the historical UFS. + * + * Blocks which have been accounted for but which have not yet been written + * to disk are given the new special disk address UNWRITTEN == -2, so that + * they can be differentiated from completely new blocks. + */ +/* VOP_BWRITE NIADDR+2 times */ +int +lfs_balloc(struct vnode *vp, off_t startoffset, int iosize, kauth_cred_t cred, + int flags, struct buf **bpp) +{ + int offset; + daddr_t daddr, idaddr; + struct buf *ibp, *bp; + struct inode *ip; + struct lfs *fs; + struct indir indirs[NIADDR+2], *idp; + daddr_t lbn, lastblock; + int bcount; + int error, frags, i, nsize, osize, num; + + ip = VTOI(vp); + fs = ip->i_lfs; + offset = blkoff(fs, startoffset); + KASSERT(iosize <= fs->lfs_bsize); + lbn = lblkno(fs, startoffset); + /* (void)lfs_check(vp, lbn, 0); */ + + ASSERT_MAYBE_SEGLOCK(fs); + + /* + * Three cases: it's a block beyond the end of file, it's a block in + * the file that may or may not have been assigned a disk address or + * we're writing an entire block. + * + * Note, if the daddr is UNWRITTEN, the block already exists in + * the cache (it was read or written earlier). If so, make sure + * we don't count it as a new block or zero out its contents. If + * it did not, make sure we allocate any necessary indirect + * blocks. + * + * If we are writing a block beyond the end of the file, we need to + * check if the old last block was a fragment. If it was, we need + * to rewrite it. + */ + + if (bpp) + *bpp = NULL; + + /* Check for block beyond end of file and fragment extension needed. */ + lastblock = lblkno(fs, ip->i_size); + if (lastblock < NDADDR && lastblock < lbn) { + osize = blksize(fs, ip, lastblock); + if (osize < fs->lfs_bsize && osize > 0) { + if ((error = lfs_fragextend(vp, osize, fs->lfs_bsize, + lastblock, + (bpp ? &bp : NULL), cred))) + return (error); + ip->i_ffs1_size = ip->i_size = + (lastblock + 1) * fs->lfs_bsize; + uvm_vnp_setsize(vp, ip->i_size); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (bpp) + (void) VOP_BWRITE(bp->b_vp, bp); + } + } + + /* + * If the block we are writing is a direct block, it's the last + * block in the file, and offset + iosize is less than a full + * block, we can write one or more fragments. There are two cases: + * the block is brand new and we should allocate it the correct + * size or it already exists and contains some fragments and + * may need to extend it. 
+ */ + if (lbn < NDADDR && lblkno(fs, ip->i_size) <= lbn) { + osize = blksize(fs, ip, lbn); + nsize = fragroundup(fs, offset + iosize); + if (lblktosize(fs, lbn) >= ip->i_size) { + /* Brand new block or fragment */ + frags = numfrags(fs, nsize); + if (!ISSPACE(fs, frags, cred)) + return ENOSPC; + if (bpp) { + *bpp = bp = getblk(vp, lbn, nsize, 0, 0); + bp->b_blkno = UNWRITTEN; + if (flags & B_CLRBUF) + clrbuf(bp); + } + ip->i_lfs_effnblks += frags; + mutex_enter(&lfs_lock); + fs->lfs_bfree -= frags; + mutex_exit(&lfs_lock); + ip->i_ffs1_db[lbn] = UNWRITTEN; + } else { + if (nsize <= osize) { + /* No need to extend */ + if (bpp && (error = bread(vp, lbn, osize, + NOCRED, 0, &bp))) + return error; + } else { + /* Extend existing block */ + if ((error = + lfs_fragextend(vp, osize, nsize, lbn, + (bpp ? &bp : NULL), cred))) + return error; + } + if (bpp) + *bpp = bp; + } + return 0; + } + + error = ufs_bmaparray(vp, lbn, &daddr, &indirs[0], &num, NULL, NULL); + if (error) + return (error); + + daddr = (daddr_t)((int32_t)daddr); /* XXX ondisk32 */ + KASSERT(daddr <= LFS_MAX_DADDR); + + /* + * Do byte accounting all at once, so we can gracefully fail *before* + * we start assigning blocks. + */ + frags = VFSTOUFS(vp->v_mount)->um_seqinc; + bcount = 0; + if (daddr == UNASSIGNED) { + bcount = frags; + } + for (i = 1; i < num; ++i) { + if (!indirs[i].in_exists) { + bcount += frags; + } + } + if (ISSPACE(fs, bcount, cred)) { + mutex_enter(&lfs_lock); + fs->lfs_bfree -= bcount; + mutex_exit(&lfs_lock); + ip->i_lfs_effnblks += bcount; + } else { + return ENOSPC; + } + + if (daddr == UNASSIGNED) { + if (num > 0 && ip->i_ffs1_ib[indirs[0].in_off] == 0) { + ip->i_ffs1_ib[indirs[0].in_off] = UNWRITTEN; + } + + /* + * Create new indirect blocks if necessary + */ + if (num > 1) { + idaddr = ip->i_ffs1_ib[indirs[0].in_off]; + for (i = 1; i < num; ++i) { + ibp = getblk(vp, indirs[i].in_lbn, + fs->lfs_bsize, 0,0); + if (!indirs[i].in_exists) { + clrbuf(ibp); + ibp->b_blkno = UNWRITTEN; + } else if (!(ibp->b_oflags & (BO_DELWRI | BO_DONE))) { + ibp->b_blkno = fsbtodb(fs, idaddr); + ibp->b_flags |= B_READ; + VOP_STRATEGY(vp, ibp); + biowait(ibp); + } + /* + * This block exists, but the next one may not. + * If that is the case mark it UNWRITTEN to keep + * the accounting straight. + */ + /* XXX ondisk32 */ + if (((int32_t *)ibp->b_data)[indirs[i].in_off] == 0) + ((int32_t *)ibp->b_data)[indirs[i].in_off] = + UNWRITTEN; + /* XXX ondisk32 */ + idaddr = ((int32_t *)ibp->b_data)[indirs[i].in_off]; +#ifdef DEBUG + if (vp == fs->lfs_ivnode) { + LFS_ENTER_LOG("balloc", __FILE__, + __LINE__, indirs[i].in_lbn, + ibp->b_flags, curproc->p_pid); + } +#endif + if ((error = VOP_BWRITE(ibp->b_vp, ibp))) + return error; + } + } + } + + + /* + * Get the existing block from the cache, if requested. + */ + if (bpp) + *bpp = bp = getblk(vp, lbn, blksize(fs, ip, lbn), 0, 0); + + /* + * Do accounting on blocks that represent pages. + */ + if (!bpp) + lfs_register_block(vp, lbn); + + /* + * The block we are writing may be a brand new block + * in which case we need to do accounting. + * + * We can tell a truly new block because ufs_bmaparray will say + * it is UNASSIGNED. Once we allocate it we will assign it the + * disk address UNWRITTEN. 
+ */ + if (daddr == UNASSIGNED) { + if (bpp) { + if (flags & B_CLRBUF) + clrbuf(bp); + + /* Note the new address */ + bp->b_blkno = UNWRITTEN; + } + + switch (num) { + case 0: + ip->i_ffs1_db[lbn] = UNWRITTEN; + break; + case 1: + ip->i_ffs1_ib[indirs[0].in_off] = UNWRITTEN; + break; + default: + idp = &indirs[num - 1]; + if (bread(vp, idp->in_lbn, fs->lfs_bsize, NOCRED, + B_MODIFY, &ibp)) + panic("lfs_balloc: bread bno %lld", + (long long)idp->in_lbn); + /* XXX ondisk32 */ + ((int32_t *)ibp->b_data)[idp->in_off] = UNWRITTEN; +#ifdef DEBUG + if (vp == fs->lfs_ivnode) { + LFS_ENTER_LOG("balloc", __FILE__, + __LINE__, idp->in_lbn, + ibp->b_flags, curproc->p_pid); + } +#endif + VOP_BWRITE(ibp->b_vp, ibp); + } + } else if (bpp && !(bp->b_oflags & (BO_DONE|BO_DELWRI))) { + /* + * Not a brand new block, also not in the cache; + * read it in from disk. + */ + if (iosize == fs->lfs_bsize) + /* Optimization: I/O is unnecessary. */ + bp->b_blkno = daddr; + else { + /* + * We need to read the block to preserve the + * existing bytes. + */ + bp->b_blkno = daddr; + bp->b_flags |= B_READ; + VOP_STRATEGY(vp, bp); + return (biowait(bp)); + } + } + + return (0); +} + +/* VOP_BWRITE 1 time */ +int +lfs_fragextend(struct vnode *vp, int osize, int nsize, daddr_t lbn, struct buf **bpp, + kauth_cred_t cred) +{ + struct inode *ip; + struct lfs *fs; + long frags; + int error; + extern long locked_queue_bytes; + size_t obufsize; + + ip = VTOI(vp); + fs = ip->i_lfs; + frags = (long)numfrags(fs, nsize - osize); + error = 0; + + ASSERT_NO_SEGLOCK(fs); + + /* + * Get the seglock so we don't enlarge blocks while a segment + * is being written. If we're called with bpp==NULL, though, + * we are only pretending to change a buffer, so we don't have to + * lock. + */ + top: + if (bpp) { + rw_enter(&fs->lfs_fraglock, RW_READER); + LFS_DEBUG_COUNTLOCKED("frag"); + } + + if (!ISSPACE(fs, frags, cred)) { + error = ENOSPC; + goto out; + } + + /* + * If we are not asked to actually return the block, all we need + * to do is allocate space for it. UBC will handle dirtying the + * appropriate things and making sure it all goes to disk. + * Don't bother to read in that case. + */ + if (bpp && (error = bread(vp, lbn, osize, NOCRED, 0, bpp))) { + brelse(*bpp, 0); + goto out; + } +#ifdef QUOTA + if ((error = chkdq(ip, frags, cred, 0))) { + if (bpp) + brelse(*bpp, 0); + goto out; + } +#endif + /* + * Adjust accounting for lfs_avail. If there's not enough room, + * we will have to wait for the cleaner, which we can't do while + * holding a block busy or while holding the seglock. In that case, + * release both and start over after waiting. 
+ */ + + if (bpp && ((*bpp)->b_oflags & BO_DELWRI)) { + if (!lfs_fits(fs, frags)) { + if (bpp) + brelse(*bpp, 0); +#ifdef QUOTA + chkdq(ip, -frags, cred, 0); +#endif + rw_exit(&fs->lfs_fraglock); + lfs_availwait(fs, frags); + goto top; + } + fs->lfs_avail -= frags; + } + + mutex_enter(&lfs_lock); + fs->lfs_bfree -= frags; + mutex_exit(&lfs_lock); + ip->i_lfs_effnblks += frags; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + + if (bpp) { + obufsize = (*bpp)->b_bufsize; + allocbuf(*bpp, nsize, 1); + + /* Adjust locked-list accounting */ + if (((*bpp)->b_flags & B_LOCKED) != 0 && + (*bpp)->b_iodone == NULL) { + mutex_enter(&lfs_lock); + locked_queue_bytes += (*bpp)->b_bufsize - obufsize; + mutex_exit(&lfs_lock); + } + + memset((char *)((*bpp)->b_data) + osize, 0, (u_int)(nsize - osize)); + } + + out: + if (bpp) { + rw_exit(&fs->lfs_fraglock); + } + return (error); +} + +static inline int +lge(struct lbnentry *a, struct lbnentry *b) +{ + return a->lbn - b->lbn; +} + +SPLAY_PROTOTYPE(lfs_splay, lbnentry, entry, lge); + +SPLAY_GENERATE(lfs_splay, lbnentry, entry, lge); + +/* + * Record this lbn as being "write pending". We used to have this information + * on the buffer headers, but since pages don't have buffer headers we + * record it here instead. + */ +void +lfs_register_block(struct vnode *vp, daddr_t lbn) +{ + struct lfs *fs; + struct inode *ip; + struct lbnentry *lbp; + + ip = VTOI(vp); + + /* Don't count metadata */ + if (lbn < 0 || vp->v_type != VREG || ip->i_number == LFS_IFILE_INUM) + return; + + fs = ip->i_lfs; + + ASSERT_NO_SEGLOCK(fs); + + /* If no space, wait for the cleaner */ + lfs_availwait(fs, btofsb(fs, 1 << fs->lfs_bshift)); + + lbp = (struct lbnentry *)pool_get(&lfs_lbnentry_pool, PR_WAITOK); + lbp->lbn = lbn; + mutex_enter(&lfs_lock); + if (SPLAY_INSERT(lfs_splay, &ip->i_lfs_lbtree, lbp) != NULL) { + mutex_exit(&lfs_lock); + /* Already there */ + pool_put(&lfs_lbnentry_pool, lbp); + return; + } + + ++ip->i_lfs_nbtree; + fs->lfs_favail += btofsb(fs, (1 << fs->lfs_bshift)); + fs->lfs_pages += fs->lfs_bsize >> PAGE_SHIFT; + ++locked_fakequeue_count; + lfs_subsys_pages += fs->lfs_bsize >> PAGE_SHIFT; + mutex_exit(&lfs_lock); +} + +static void +lfs_do_deregister(struct lfs *fs, struct inode *ip, struct lbnentry *lbp) +{ + ASSERT_MAYBE_SEGLOCK(fs); + + mutex_enter(&lfs_lock); + --ip->i_lfs_nbtree; + SPLAY_REMOVE(lfs_splay, &ip->i_lfs_lbtree, lbp); + if (fs->lfs_favail > btofsb(fs, (1 << fs->lfs_bshift))) + fs->lfs_favail -= btofsb(fs, (1 << fs->lfs_bshift)); + fs->lfs_pages -= fs->lfs_bsize >> PAGE_SHIFT; + if (locked_fakequeue_count > 0) + --locked_fakequeue_count; + lfs_subsys_pages -= fs->lfs_bsize >> PAGE_SHIFT; + mutex_exit(&lfs_lock); + + pool_put(&lfs_lbnentry_pool, lbp); +} + +void +lfs_deregister_block(struct vnode *vp, daddr_t lbn) +{ + struct lfs *fs; + struct inode *ip; + struct lbnentry *lbp; + struct lbnentry tmp; + + ip = VTOI(vp); + + /* Don't count metadata */ + if (lbn < 0 || vp->v_type != VREG || ip->i_number == LFS_IFILE_INUM) + return; + + fs = ip->i_lfs; + tmp.lbn = lbn; + lbp = SPLAY_FIND(lfs_splay, &ip->i_lfs_lbtree, &tmp); + if (lbp == NULL) + return; + + lfs_do_deregister(fs, ip, lbp); +} + +void +lfs_deregister_all(struct vnode *vp) +{ + struct lbnentry *lbp, *nlbp; + struct lfs_splay *hd; + struct lfs *fs; + struct inode *ip; + + ip = VTOI(vp); + fs = ip->i_lfs; + hd = &ip->i_lfs_lbtree; + + for (lbp = SPLAY_MIN(lfs_splay, hd); lbp != NULL; lbp = nlbp) { + nlbp = SPLAY_NEXT(lfs_splay, hd, lbp); + lfs_do_deregister(fs, ip, lbp); + } +} diff --git 
a/sys/ufs/lfs/lfs_bio.c b/sys/ufs/lfs/lfs_bio.c new file mode 100644 index 000000000..fe3d4b52e --- /dev/null +++ b/sys/ufs/lfs/lfs_bio.c @@ -0,0 +1,858 @@ +/* $NetBSD: lfs_bio.c,v 1.120 2011/07/11 08:27:40 hannken Exp $ */ + +/*- + * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2008 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Konrad E. Schroder . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)lfs_bio.c 8.10 (Berkeley) 6/10/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: lfs_bio.c,v 1.120 2011/07/11 08:27:40 hannken Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include + +/* + * LFS block write function. + * + * XXX + * No write cost accounting is done. + * This is almost certainly wrong for synchronous operations and NFS. + * + * protected by lfs_lock. + */ +int locked_queue_count = 0; /* Count of locked-down buffers. */ +long locked_queue_bytes = 0L; /* Total size of locked buffers. */ +int lfs_subsys_pages = 0L; /* Total number LFS-written pages */ +int lfs_fs_pagetrip = 0; /* # of pages to trip per-fs write */ +int lfs_writing = 0; /* Set if already kicked off a writer + because of buffer space */ + +/* Lock and condition variables for above. */ +kcondvar_t locked_queue_cv; +kcondvar_t lfs_writing_cv; +kmutex_t lfs_lock; + +extern int lfs_dostats; + +/* + * reserved number/bytes of locked buffers + */ +int locked_queue_rcount = 0; +long locked_queue_rbytes = 0L; + +static int lfs_fits_buf(struct lfs *, int, int); +static int lfs_reservebuf(struct lfs *, struct vnode *vp, struct vnode *vp2, + int, int); +static int lfs_reserveavail(struct lfs *, struct vnode *vp, struct vnode *vp2, + int); + +static int +lfs_fits_buf(struct lfs *fs, int n, int bytes) +{ + int count_fit, bytes_fit; + + ASSERT_NO_SEGLOCK(fs); + KASSERT(mutex_owned(&lfs_lock)); + + count_fit = + (locked_queue_count + locked_queue_rcount + n <= LFS_WAIT_BUFS); + bytes_fit = + (locked_queue_bytes + locked_queue_rbytes + bytes <= LFS_WAIT_BYTES); + +#ifdef DEBUG + if (!count_fit) { + DLOG((DLOG_AVAIL, "lfs_fits_buf: no fit count: %d + %d + %d >= %d\n", + locked_queue_count, locked_queue_rcount, + n, LFS_WAIT_BUFS)); + } + if (!bytes_fit) { + DLOG((DLOG_AVAIL, "lfs_fits_buf: no fit bytes: %ld + %ld + %d >= %ld\n", + locked_queue_bytes, locked_queue_rbytes, + bytes, LFS_WAIT_BYTES)); + } +#endif /* DEBUG */ + + return (count_fit && bytes_fit); +} + +/* ARGSUSED */ +static int +lfs_reservebuf(struct lfs *fs, struct vnode *vp, + struct vnode *vp2, int n, int bytes) +{ + ASSERT_MAYBE_SEGLOCK(fs); + KASSERT(locked_queue_rcount >= 0); + KASSERT(locked_queue_rbytes >= 0); + + mutex_enter(&lfs_lock); + while (n > 0 && !lfs_fits_buf(fs, n, bytes)) { + int error; + + lfs_flush(fs, 0, 0); + + error = cv_timedwait_sig(&locked_queue_cv, &lfs_lock, + hz * LFS_BUFWAIT); + if (error && error != EWOULDBLOCK) { + mutex_exit(&lfs_lock); + return error; + } + } + + locked_queue_rcount += n; + locked_queue_rbytes += bytes; + + if (n < 0) + cv_broadcast(&locked_queue_cv); + + mutex_exit(&lfs_lock); + + KASSERT(locked_queue_rcount >= 0); + KASSERT(locked_queue_rbytes >= 0); + + return 0; +} + +/* + * Try to reserve some blocks, prior to performing a sensitive operation that + * requires the vnode lock to be honored. If there is not enough space, give + * up the vnode lock temporarily and wait for the space to become available. + * + * Called with vp locked. (Note nowever that if fsb < 0, vp is ignored.) + * + * XXX YAMT - it isn't safe to unlock vp here + * because the node might be modified while we sleep. + * (eg. cached states like i_offset might be stale, + * the vnode might be truncated, etc..) + * maybe we should have a way to restart the vnodeop (EVOPRESTART?) 
+ * or rearrange vnodeop interface to leave vnode locking to file system + * specific code so that each file systems can have their own vnode locking and + * vnode re-using strategies. + */ +static int +lfs_reserveavail(struct lfs *fs, struct vnode *vp, + struct vnode *vp2, int fsb) +{ + CLEANERINFO *cip; + struct buf *bp; + int error, slept; + + ASSERT_MAYBE_SEGLOCK(fs); + slept = 0; + mutex_enter(&lfs_lock); + while (fsb > 0 && !lfs_fits(fs, fsb + fs->lfs_ravail + fs->lfs_favail)) { + mutex_exit(&lfs_lock); +#if 0 + /* + * XXX ideally, we should unlock vnodes here + * because we might sleep very long time. + */ + VOP_UNLOCK(vp); + if (vp2 != NULL) { + VOP_UNLOCK(vp2); + } +#else + /* + * XXX since we'll sleep for cleaner with vnode lock holding, + * deadlock will occur if cleaner tries to lock the vnode. + * (eg. lfs_markv -> lfs_fastvget -> getnewvnode -> vclean) + */ +#endif + + if (!slept) { + DLOG((DLOG_AVAIL, "lfs_reserve: waiting for %ld (bfree = %d," + " est_bfree = %d)\n", + fsb + fs->lfs_ravail + fs->lfs_favail, + fs->lfs_bfree, LFS_EST_BFREE(fs))); + } + ++slept; + + /* Wake up the cleaner */ + LFS_CLEANERINFO(cip, fs, bp); + LFS_SYNC_CLEANERINFO(cip, fs, bp, 0); + lfs_wakeup_cleaner(fs); + + mutex_enter(&lfs_lock); + /* Cleaner might have run while we were reading, check again */ + if (lfs_fits(fs, fsb + fs->lfs_ravail + fs->lfs_favail)) + break; + + error = mtsleep(&fs->lfs_avail, PCATCH | PUSER, "lfs_reserve", + 0, &lfs_lock); +#if 0 + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* XXX use lockstatus */ + vn_lock(vp2, LK_EXCLUSIVE | LK_RETRY); /* XXX use lockstatus */ +#endif + if (error) { + mutex_exit(&lfs_lock); + return error; + } + } +#ifdef DEBUG + if (slept) { + DLOG((DLOG_AVAIL, "lfs_reserve: woke up\n")); + } +#endif + fs->lfs_ravail += fsb; + mutex_exit(&lfs_lock); + + return 0; +} + +#ifdef DIAGNOSTIC +int lfs_rescount; +int lfs_rescountdirop; +#endif + +int +lfs_reserve(struct lfs *fs, struct vnode *vp, struct vnode *vp2, int fsb) +{ + int error; + int cantwait; + + ASSERT_MAYBE_SEGLOCK(fs); + if (vp2) { + /* Make sure we're not in the process of reclaiming vp2 */ + mutex_enter(&lfs_lock); + while(fs->lfs_flags & LFS_UNDIROP) { + mtsleep(&fs->lfs_flags, PRIBIO + 1, "lfsrundirop", 0, + &lfs_lock); + } + mutex_exit(&lfs_lock); + } + + KASSERT(fsb < 0 || VOP_ISLOCKED(vp)); + KASSERT(vp2 == NULL || fsb < 0 || VOP_ISLOCKED(vp2)); + KASSERT(vp2 == NULL || !(VTOI(vp2)->i_flag & IN_ADIROP)); + KASSERT(vp2 == NULL || vp2 != fs->lfs_unlockvp); + + cantwait = (VTOI(vp)->i_flag & IN_ADIROP) || fs->lfs_unlockvp == vp; +#ifdef DIAGNOSTIC + if (cantwait) { + if (fsb > 0) + lfs_rescountdirop++; + else if (fsb < 0) + lfs_rescountdirop--; + if (lfs_rescountdirop < 0) + panic("lfs_rescountdirop"); + } + else { + if (fsb > 0) + lfs_rescount++; + else if (fsb < 0) + lfs_rescount--; + if (lfs_rescount < 0) + panic("lfs_rescount"); + } +#endif + if (cantwait) + return 0; + + /* + * XXX + * vref vnodes here so that cleaner doesn't try to reuse them. + * (see XXX comment in lfs_reserveavail) + */ + vhold(vp); + if (vp2 != NULL) { + vhold(vp2); + } + + error = lfs_reserveavail(fs, vp, vp2, fsb); + if (error) + goto done; + + /* + * XXX just a guess. should be more precise. 
+ */ + error = lfs_reservebuf(fs, vp, vp2, fsb, fsbtob(fs, fsb)); + if (error) + lfs_reserveavail(fs, vp, vp2, -fsb); + +done: + holdrele(vp); + if (vp2 != NULL) { + holdrele(vp2); + } + + return error; +} + +int +lfs_bwrite(void *v) +{ + struct vop_bwrite_args /* { + struct vnode *a_vp; + struct buf *a_bp; + } */ *ap = v; + struct buf *bp = ap->a_bp; + +#ifdef DIAGNOSTIC + if (VTOI(bp->b_vp)->i_lfs->lfs_ronly == 0 && (bp->b_flags & B_ASYNC)) { + panic("bawrite LFS buffer"); + } +#endif /* DIAGNOSTIC */ + return lfs_bwrite_ext(bp, 0); +} + +/* + * Determine if there is enough room currently available to write fsb + * blocks. We need enough blocks for the new blocks, the current + * inode blocks (including potentially the ifile inode), a summary block, + * and the segment usage table, plus an ifile block. + */ +int +lfs_fits(struct lfs *fs, int fsb) +{ + int needed; + + ASSERT_NO_SEGLOCK(fs); + needed = fsb + btofsb(fs, fs->lfs_sumsize) + + ((howmany(fs->lfs_uinodes + 1, INOPB(fs)) + fs->lfs_segtabsz + + 1) << (fs->lfs_bshift - fs->lfs_ffshift)); + + if (needed >= fs->lfs_avail) { +#ifdef DEBUG + DLOG((DLOG_AVAIL, "lfs_fits: no fit: fsb = %ld, uinodes = %ld, " + "needed = %ld, avail = %ld\n", + (long)fsb, (long)fs->lfs_uinodes, (long)needed, + (long)fs->lfs_avail)); +#endif + return 0; + } + return 1; +} + +int +lfs_availwait(struct lfs *fs, int fsb) +{ + int error; + CLEANERINFO *cip; + struct buf *cbp; + + ASSERT_NO_SEGLOCK(fs); + /* Push cleaner blocks through regardless */ + mutex_enter(&lfs_lock); + if (LFS_SEGLOCK_HELD(fs) && + fs->lfs_sp->seg_flags & (SEGM_CLEAN | SEGM_FORCE_CKP)) { + mutex_exit(&lfs_lock); + return 0; + } + mutex_exit(&lfs_lock); + + while (!lfs_fits(fs, fsb)) { + /* + * Out of space, need cleaner to run. + * Update the cleaner info, then wake it up. + * Note the cleanerinfo block is on the ifile + * so it CANT_WAIT. + */ + LFS_CLEANERINFO(cip, fs, cbp); + LFS_SYNC_CLEANERINFO(cip, fs, cbp, 0); + +#ifdef DEBUG + DLOG((DLOG_AVAIL, "lfs_availwait: out of available space, " + "waiting on cleaner\n")); +#endif + + lfs_wakeup_cleaner(fs); +#ifdef DIAGNOSTIC + if (LFS_SEGLOCK_HELD(fs)) + panic("lfs_availwait: deadlock"); +#endif + error = tsleep(&fs->lfs_avail, PCATCH | PUSER, "cleaner", 0); + if (error) + return (error); + } + return 0; +} + +int +lfs_bwrite_ext(struct buf *bp, int flags) +{ + struct lfs *fs; + struct inode *ip; + struct vnode *vp; + int fsb; + + vp = bp->b_vp; + fs = VFSTOUFS(vp->v_mount)->um_lfs; + + ASSERT_MAYBE_SEGLOCK(fs); + KASSERT(bp->b_cflags & BC_BUSY); + KASSERT(flags & BW_CLEAN || !LFS_IS_MALLOC_BUF(bp)); + KASSERT(((bp->b_oflags | bp->b_flags) & (BO_DELWRI|B_LOCKED)) + != BO_DELWRI); + + /* + * Don't write *any* blocks if we're mounted read-only, or + * if we are "already unmounted". + * + * In particular the cleaner can't write blocks either. + */ + if (fs->lfs_ronly || (fs->lfs_pflags & LFS_PF_CLEAN)) { + bp->b_oflags &= ~BO_DELWRI; + bp->b_flags |= B_READ; + bp->b_error = 0; + mutex_enter(&bufcache_lock); + LFS_UNLOCK_BUF(bp); + if (LFS_IS_MALLOC_BUF(bp)) + bp->b_cflags &= ~BC_BUSY; + else + brelsel(bp, 0); + mutex_exit(&bufcache_lock); + return (fs->lfs_ronly ? EROFS : 0); + } + + /* + * Set the delayed write flag and use reassignbuf to move the buffer + * from the clean list to the dirty one. + * + * Set the B_LOCKED flag and unlock the buffer, causing brelse to move + * the buffer onto the LOCKED free list. This is necessary, otherwise + * getnewbuf() would try to reclaim the buffers using bawrite, which + * isn't going to work. 
+ * + * XXX we don't let meta-data writes run out of space because they can + * come from the segment writer. We need to make sure that there is + * enough space reserved so that there's room to write meta-data + * blocks. + */ + if ((bp->b_flags & B_LOCKED) == 0) { + fsb = numfrags(fs, bp->b_bcount); + + ip = VTOI(vp); + mutex_enter(&lfs_lock); + if (flags & BW_CLEAN) { + LFS_SET_UINO(ip, IN_CLEANING); + } else { + LFS_SET_UINO(ip, IN_MODIFIED); + } + mutex_exit(&lfs_lock); + fs->lfs_avail -= fsb; + + mutex_enter(&bufcache_lock); + mutex_enter(vp->v_interlock); + bp->b_oflags = (bp->b_oflags | BO_DELWRI) & ~BO_DONE; + LFS_LOCK_BUF(bp); + bp->b_flags &= ~B_READ; + bp->b_error = 0; + reassignbuf(bp, bp->b_vp); + mutex_exit(vp->v_interlock); + } else { + mutex_enter(&bufcache_lock); + } + + if (bp->b_iodone != NULL) + bp->b_cflags &= ~BC_BUSY; + else + brelsel(bp, 0); + mutex_exit(&bufcache_lock); + + return (0); +} + +/* + * Called and return with the lfs_lock held. + */ +void +lfs_flush_fs(struct lfs *fs, int flags) +{ + ASSERT_NO_SEGLOCK(fs); + KASSERT(mutex_owned(&lfs_lock)); + if (fs->lfs_ronly) + return; + + if (lfs_dostats) + ++lfs_stats.flush_invoked; + + mutex_exit(&lfs_lock); + lfs_writer_enter(fs, "fldirop"); + lfs_segwrite(fs->lfs_ivnode->v_mount, flags); + lfs_writer_leave(fs); + mutex_enter(&lfs_lock); + fs->lfs_favail = 0; /* XXX */ +} + +/* + * This routine initiates segment writes when LFS is consuming too many + * resources. Ideally the pageout daemon would be able to direct LFS + * more subtly. + * XXX We have one static count of locked buffers; + * XXX need to think more about the multiple filesystem case. + * + * Called and return with lfs_lock held. + * If fs != NULL, we hold the segment lock for fs. + */ +void +lfs_flush(struct lfs *fs, int flags, int only_onefs) +{ + extern u_int64_t locked_fakequeue_count; + struct mount *mp, *nmp; + struct lfs *tfs; + + KASSERT(mutex_owned(&lfs_lock)); + KDASSERT(fs == NULL || !LFS_SEGLOCK_HELD(fs)); + + if (lfs_dostats) + ++lfs_stats.write_exceeded; + /* XXX should we include SEGM_CKP here? */ + if (lfs_writing && !(flags & SEGM_SYNC)) { + DLOG((DLOG_FLUSH, "lfs_flush: not flushing because another flush is active\n")); + return; + } + while (lfs_writing) + cv_wait(&lfs_writing_cv, &lfs_lock); + lfs_writing = 1; + + mutex_exit(&lfs_lock); + + if (only_onefs) { + KASSERT(fs != NULL); + if (vfs_busy(fs->lfs_ivnode->v_mount, NULL)) + goto errout; + mutex_enter(&lfs_lock); + lfs_flush_fs(fs, flags); + mutex_exit(&lfs_lock); + vfs_unbusy(fs->lfs_ivnode->v_mount, false, NULL); + } else { + locked_fakequeue_count = 0; + mutex_enter(&mountlist_lock); + for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist; + mp = nmp) { + if (vfs_busy(mp, &nmp)) { + DLOG((DLOG_FLUSH, "lfs_flush: fs vfs_busy\n")); + continue; + } + if (strncmp(&mp->mnt_stat.f_fstypename[0], MOUNT_LFS, + sizeof(mp->mnt_stat.f_fstypename)) == 0) { + tfs = VFSTOUFS(mp)->um_lfs; + mutex_enter(&lfs_lock); + lfs_flush_fs(tfs, flags); + mutex_exit(&lfs_lock); + } + vfs_unbusy(mp, false, &nmp); + } + mutex_exit(&mountlist_lock); + } + LFS_DEBUG_COUNTLOCKED("flush"); + wakeup(&lfs_subsys_pages); + + errout: + mutex_enter(&lfs_lock); + KASSERT(lfs_writing); + lfs_writing = 0; + wakeup(&lfs_writing); +} + +#define INOCOUNT(fs) howmany((fs)->lfs_uinodes, INOPB(fs)) +#define INOBYTES(fs) ((fs)->lfs_uinodes * sizeof (struct ufs1_dinode)) + +/* + * make sure that we don't have too many locked buffers. + * flush buffers if needed. 
+ */ +int +lfs_check(struct vnode *vp, daddr_t blkno, int flags) +{ + int error; + struct lfs *fs; + struct inode *ip; + extern pid_t lfs_writer_daemon; + + error = 0; + ip = VTOI(vp); + + /* If out of buffers, wait on writer */ + /* XXX KS - if it's the Ifile, we're probably the cleaner! */ + if (ip->i_number == LFS_IFILE_INUM) + return 0; + /* If we're being called from inside a dirop, don't sleep */ + if (ip->i_flag & IN_ADIROP) + return 0; + + fs = ip->i_lfs; + + ASSERT_NO_SEGLOCK(fs); + + /* + * If we would flush below, but dirops are active, sleep. + * Note that a dirop cannot ever reach this code! + */ + mutex_enter(&lfs_lock); + while (fs->lfs_dirops > 0 && + (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS || + locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES || + lfs_subsys_pages > LFS_MAX_PAGES || + fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) || + lfs_dirvcount > LFS_MAX_DIROP || fs->lfs_diropwait > 0)) + { + ++fs->lfs_diropwait; + mtsleep(&fs->lfs_writer, PRIBIO+1, "bufdirop", 0, + &lfs_lock); + --fs->lfs_diropwait; + } + +#ifdef DEBUG + if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS) + DLOG((DLOG_FLUSH, "lfs_check: lqc = %d, max %d\n", + locked_queue_count + INOCOUNT(fs), LFS_MAX_BUFS)); + if (locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES) + DLOG((DLOG_FLUSH, "lfs_check: lqb = %ld, max %ld\n", + locked_queue_bytes + INOBYTES(fs), LFS_MAX_BYTES)); + if (lfs_subsys_pages > LFS_MAX_PAGES) + DLOG((DLOG_FLUSH, "lfs_check: lssp = %d, max %d\n", + lfs_subsys_pages, LFS_MAX_PAGES)); + if (lfs_fs_pagetrip && fs->lfs_pages > lfs_fs_pagetrip) + DLOG((DLOG_FLUSH, "lfs_check: fssp = %d, trip at %d\n", + fs->lfs_pages, lfs_fs_pagetrip)); + if (lfs_dirvcount > LFS_MAX_DIROP) + DLOG((DLOG_FLUSH, "lfs_check: ldvc = %d, max %d\n", + lfs_dirvcount, LFS_MAX_DIROP)); + if (fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs)) + DLOG((DLOG_FLUSH, "lfs_check: lfdvc = %d, max %d\n", + fs->lfs_dirvcount, LFS_MAX_FSDIROP(fs))); + if (fs->lfs_diropwait > 0) + DLOG((DLOG_FLUSH, "lfs_check: ldvw = %d\n", + fs->lfs_diropwait)); +#endif + + /* If there are too many pending dirops, we have to flush them. */ + if (fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) || + lfs_dirvcount > LFS_MAX_DIROP || fs->lfs_diropwait > 0) { + flags |= SEGM_CKP; + } + + if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS || + locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES || + lfs_subsys_pages > LFS_MAX_PAGES || + fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) || + lfs_dirvcount > LFS_MAX_DIROP || fs->lfs_diropwait > 0) { + lfs_flush(fs, flags, 0); + } else if (lfs_fs_pagetrip && fs->lfs_pages > lfs_fs_pagetrip) { + /* + * If we didn't flush the whole thing, some filesystems + * still might want to be flushed. + */ + ++fs->lfs_pdflush; + wakeup(&lfs_writer_daemon); + } + + while (locked_queue_count + INOCOUNT(fs) >= LFS_WAIT_BUFS || + locked_queue_bytes + INOBYTES(fs) >= LFS_WAIT_BYTES || + lfs_subsys_pages > LFS_WAIT_PAGES || + fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) || + lfs_dirvcount > LFS_MAX_DIROP) { + + if (lfs_dostats) + ++lfs_stats.wait_exceeded; + DLOG((DLOG_AVAIL, "lfs_check: waiting: count=%d, bytes=%ld\n", + locked_queue_count, locked_queue_bytes)); + error = cv_timedwait_sig(&locked_queue_cv, &lfs_lock, + hz * LFS_BUFWAIT); + if (error != EWOULDBLOCK) + break; + + /* + * lfs_flush might not flush all the buffers, if some of the + * inodes were locked or if most of them were Ifile blocks + * and we weren't asked to checkpoint. Try flushing again + * to keep us from blocking indefinitely. 
+ */ + if (locked_queue_count + INOCOUNT(fs) >= LFS_MAX_BUFS || + locked_queue_bytes + INOBYTES(fs) >= LFS_MAX_BYTES) { + lfs_flush(fs, flags | SEGM_CKP, 0); + } + } + mutex_exit(&lfs_lock); + return (error); +} + +/* + * Allocate a new buffer header. + */ +struct buf * +lfs_newbuf(struct lfs *fs, struct vnode *vp, daddr_t daddr, size_t size, int type) +{ + struct buf *bp; + size_t nbytes; + + ASSERT_MAYBE_SEGLOCK(fs); + nbytes = roundup(size, fsbtob(fs, 1)); + + bp = getiobuf(NULL, true); + if (nbytes) { + bp->b_data = lfs_malloc(fs, nbytes, type); + /* memset(bp->b_data, 0, nbytes); */ + } +#ifdef DIAGNOSTIC + if (vp == NULL) + panic("vp is NULL in lfs_newbuf"); + if (bp == NULL) + panic("bp is NULL after malloc in lfs_newbuf"); +#endif + + bp->b_bufsize = size; + bp->b_bcount = size; + bp->b_lblkno = daddr; + bp->b_blkno = daddr; + bp->b_error = 0; + bp->b_resid = 0; + bp->b_iodone = lfs_callback; + bp->b_cflags = BC_BUSY | BC_NOCACHE; + bp->b_private = fs; + + mutex_enter(&bufcache_lock); + mutex_enter(vp->v_interlock); + bgetvp(vp, bp); + mutex_exit(vp->v_interlock); + mutex_exit(&bufcache_lock); + + return (bp); +} + +void +lfs_freebuf(struct lfs *fs, struct buf *bp) +{ + struct vnode *vp; + + if ((vp = bp->b_vp) != NULL) { + mutex_enter(&bufcache_lock); + mutex_enter(vp->v_interlock); + brelvp(bp); + mutex_exit(vp->v_interlock); + mutex_exit(&bufcache_lock); + } + if (!(bp->b_cflags & BC_INVAL)) { /* BC_INVAL indicates a "fake" buffer */ + lfs_free(fs, bp->b_data, LFS_NB_UNKNOWN); + bp->b_data = NULL; + } + putiobuf(bp); +} + +/* + * Count buffers on the "locked" queue, and compare it to a pro-forma count. + * Don't count malloced buffers, since they don't detract from the total. + */ +void +lfs_countlocked(int *count, long *bytes, const char *msg) +{ + struct buf *bp; + int n = 0; + long int size = 0L; + + mutex_enter(&bufcache_lock); + TAILQ_FOREACH(bp, &bufqueues[BQ_LOCKED].bq_queue, b_freelist) { + KASSERT(bp->b_iodone == NULL); + n++; + size += bp->b_bufsize; +#ifdef DIAGNOSTIC + if (n > nbuf) + panic("lfs_countlocked: this can't happen: more" + " buffers locked than exist"); +#endif + } + /* + * Theoretically this function never really does anything. + * Give a warning if we have to fix the accounting. + */ + if (n != *count) { + DLOG((DLOG_LLIST, "lfs_countlocked: %s: adjusted buf count" + " from %d to %d\n", msg, *count, n)); + } + if (size != *bytes) { + DLOG((DLOG_LLIST, "lfs_countlocked: %s: adjusted byte count" + " from %ld to %ld\n", msg, *bytes, size)); + } + *count = n; + *bytes = size; + mutex_exit(&bufcache_lock); + return; +} + +int +lfs_wait_pages(void) +{ + int active, inactive; + + uvm_estimatepageable(&active, &inactive); + return LFS_WAIT_RESOURCE(active + inactive + uvmexp.free, 1); +} + +int +lfs_max_pages(void) +{ + int active, inactive; + + uvm_estimatepageable(&active, &inactive); + return LFS_MAX_RESOURCE(active + inactive + uvmexp.free, 1); +} diff --git a/sys/ufs/lfs/lfs_cksum.c b/sys/ufs/lfs/lfs_cksum.c new file mode 100644 index 000000000..a3f0fb93d --- /dev/null +++ b/sys/ufs/lfs/lfs_cksum.c @@ -0,0 +1,110 @@ +/* $NetBSD: lfs_cksum.c,v 1.27 2008/04/28 20:24:11 martin Exp $ */ + +/*- + * Copyright (c) 1999, 2000, 2001, 2002 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Konrad E. Schroder . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_cksum.c 8.2 (Berkeley) 10/9/94 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: lfs_cksum.c,v 1.27 2008/04/28 20:24:11 martin Exp $"); + +#include +#ifdef _KERNEL +# include +# include +#else +# include +#endif +#include +#include +#include +#include + +/* + * Simple, general purpose, fast checksum. Data must be short-aligned. + * Returns a u_long in case we ever want to do something more rigorous. + * + * XXX + * Use the TCP/IP checksum instead. 
+ */ +u_int32_t +lfs_cksum_part(void *str, size_t len, u_int32_t sum) +{ + + len &= ~(sizeof(u_int16_t) - 1); + for (; len; len -= sizeof(u_int16_t)) { + sum ^= *(u_int16_t *)str; + str = (void *)((u_int16_t *)str + 1); + } + return (sum); +} + +u_int32_t +cksum(void *str, size_t len) +{ + + return lfs_cksum_fold(lfs_cksum_part(str, len, 0)); +} + +u_int32_t +lfs_sb_cksum(struct dlfs *fs) +{ + size_t size; + + size = (size_t)offsetof(struct dlfs, dlfs_cksum); + return cksum(fs, size); +} diff --git a/sys/ufs/lfs/lfs_debug.c b/sys/ufs/lfs/lfs_debug.c new file mode 100644 index 000000000..ecad77204 --- /dev/null +++ b/sys/ufs/lfs/lfs_debug.c @@ -0,0 +1,325 @@ +/* $NetBSD: lfs_debug.c,v 1.39 2011/07/17 20:54:54 joerg Exp $ */ + +/*- + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Konrad E. Schroder . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_debug.c 8.1 (Berkeley) 6/11/93 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: lfs_debug.c,v 1.39 2011/07/17 20:54:54 joerg Exp $"); + +#ifdef DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +int lfs_lognum; +struct lfs_log_entry lfs_log[LFS_LOGLENGTH]; + +int +lfs_bwrite_log(struct buf *bp, const char *file, int line) +{ + struct vop_bwrite_args a; + + a.a_desc = VDESC(vop_bwrite); + a.a_bp = bp; + + if (!(bp->b_flags & B_GATHERED) && !(bp->b_oflags & BO_DELWRI)) { + LFS_ENTER_LOG("write", file, line, bp->b_lblkno, bp->b_flags, + curproc->p_pid); + } + return (VCALL(bp->b_vp, VOFFSET(vop_bwrite), &a)); +} + +void +lfs_dumplog(void) +{ + int i; + const char *cp; + + for (i = lfs_lognum; i != (lfs_lognum - 1) % LFS_LOGLENGTH; + i = (i + 1) % LFS_LOGLENGTH) + if (lfs_log[i].file) { + /* Only print out basename, for readability */ + cp = lfs_log[i].file; + while(*cp) + ++cp; + while(*cp != '/' && cp > lfs_log[i].file) + --cp; + + printf("lbn %" PRId64 " %s %lx %d, %d %s\n", + lfs_log[i].block, + lfs_log[i].op, + lfs_log[i].flags, + lfs_log[i].pid, + lfs_log[i].line, + cp); + } +} + +void +lfs_dump_super(struct lfs *lfsp) +{ + int i; + + printf("%s%x\t%s%x\t%s%d\t%s%d\n", + "magic ", lfsp->lfs_magic, + "version ", lfsp->lfs_version, + "size ", lfsp->lfs_size, + "ssize ", lfsp->lfs_ssize); + printf("%s%d\t%s%d\t%s%d\t%s%d\n", + "dsize ", lfsp->lfs_dsize, + "bsize ", lfsp->lfs_bsize, + "fsize ", lfsp->lfs_fsize, + "frag ", lfsp->lfs_frag); + + printf("%s%d\t%s%d\t%s%d\t%s%d\n", + "minfree ", lfsp->lfs_minfree, + "inopb ", lfsp->lfs_inopb, + "ifpb ", lfsp->lfs_ifpb, + "nindir ", lfsp->lfs_nindir); + + printf("%s%d\t%s%d\t%s%d\t%s%d\n", + "nseg ", lfsp->lfs_nseg, + "nspf ", lfsp->lfs_nspf, + "cleansz ", lfsp->lfs_cleansz, + "segtabsz ", lfsp->lfs_segtabsz); + + printf("%s%x\t%s%d\t%s%lx\t%s%d\n", + "segmask ", lfsp->lfs_segmask, + "segshift ", lfsp->lfs_segshift, + "bmask ", (unsigned long)lfsp->lfs_bmask, + "bshift ", lfsp->lfs_bshift); + + printf("%s%lu\t%s%d\t%s%lx\t%s%u\n", + "ffmask ", (unsigned long)lfsp->lfs_ffmask, + "ffshift ", lfsp->lfs_ffshift, + "fbmask ", (unsigned long)lfsp->lfs_fbmask, + "fbshift ", lfsp->lfs_fbshift); + + printf("%s%d\t%s%d\t%s%x\t%s%qx\n", + "sushift ", lfsp->lfs_sushift, + "fsbtodb ", lfsp->lfs_fsbtodb, + "cksum ", lfsp->lfs_cksum, + "maxfilesize ", (long long)lfsp->lfs_maxfilesize); + + printf("Superblock disk addresses:"); + for (i = 0; i < LFS_MAXNUMSB; i++) + printf(" %x", lfsp->lfs_sboffs[i]); + printf("\n"); + + printf("Checkpoint Info\n"); + printf("%s%d\t%s%x\t%s%d\n", + "freehd ", lfsp->lfs_freehd, + "idaddr ", lfsp->lfs_idaddr, + "ifile ", lfsp->lfs_ifile); + printf("%s%x\t%s%d\t%s%x\t%s%x\t%s%x\t%s%x\n", + "bfree ", lfsp->lfs_bfree, + "nfiles ", lfsp->lfs_nfiles, + "lastseg ", lfsp->lfs_lastseg, + "nextseg ", lfsp->lfs_nextseg, + "curseg ", lfsp->lfs_curseg, + "offset ", lfsp->lfs_offset); + printf("tstamp %llx\n", (long 
long)lfsp->lfs_tstamp); +} + +void +lfs_dump_dinode(struct ufs1_dinode *dip) +{ + int i; + + printf("%s%u\t%s%d\t%s%u\t%s%u\t%s%qu\t%s%d\n", + "mode ", dip->di_mode, + "nlink ", dip->di_nlink, + "uid ", dip->di_uid, + "gid ", dip->di_gid, + "size ", (long long)dip->di_size, + "blocks ", dip->di_blocks); + printf("inum %d\n", dip->di_inumber); + printf("Direct Addresses\n"); + for (i = 0; i < NDADDR; i++) { + printf("\t%x", dip->di_db[i]); + if ((i % 6) == 5) + printf("\n"); + } + for (i = 0; i < NIADDR; i++) + printf("\t%x", dip->di_ib[i]); + printf("\n"); +} + +void +lfs_check_segsum(struct lfs *fs, struct segment *sp, char *file, int line) +{ + int actual; +#if 0 + static int offset; +#endif + + if ((actual = 1) == 1) + return; /* XXXX not checking this anymore, really */ + + if (sp->sum_bytes_left >= FINFOSIZE + && sp->fip->fi_nblocks > 512) { + printf("%s:%d: fi_nblocks = %d\n",file,line,sp->fip->fi_nblocks); +#ifdef DDB + Debugger(); +#endif + } + + if (sp->sum_bytes_left > 484) { + printf("%s:%d: bad value (%d = -%d) for sum_bytes_left\n", + file, line, sp->sum_bytes_left, fs->lfs_sumsize-sp->sum_bytes_left); + panic("too many bytes"); + } + + actual = fs->lfs_sumsize + /* amount taken up by FINFOs */ + - ((char *)&(sp->fip->fi_blocks[sp->fip->fi_nblocks]) - (char *)(sp->segsum)) + /* amount taken up by inode blocks */ + - sizeof(int32_t)*((sp->ninodes+INOPB(fs)-1) / INOPB(fs)); +#if 0 + if (actual - sp->sum_bytes_left < offset) + { + printf("%s:%d: offset changed %d -> %d\n", file, line, + offset, actual-sp->sum_bytes_left); + offset = actual - sp->sum_bytes_left; + /* panic("byte mismatch"); */ + } +#endif +#if 0 + if (actual != sp->sum_bytes_left) + printf("%s:%d: warning: segsum miscalc at %d (-%d => %d)\n", + file, line, sp->sum_bytes_left, + fs->lfs_sumsize-sp->sum_bytes_left, + actual); +#endif + if (sp->sum_bytes_left > 0 + && ((char *)(sp->segsum))[fs->lfs_sumsize + - sizeof(int32_t) * ((sp->ninodes+INOPB(fs)-1) / INOPB(fs)) + - sp->sum_bytes_left] != '\0') { + printf("%s:%d: warning: segsum overwrite at %d (-%d => %d)\n", + file, line, sp->sum_bytes_left, + fs->lfs_sumsize-sp->sum_bytes_left, + actual); +#ifdef DDB + Debugger(); +#endif + } +} + +void +lfs_check_bpp(struct lfs *fs, struct segment *sp, char *file, int line) +{ + daddr_t blkno; + struct buf **bpp; + struct vnode *devvp; + + devvp = VTOI(fs->lfs_ivnode)->i_devvp; + blkno = (*(sp->bpp))->b_blkno; + for (bpp = sp->bpp; bpp < sp->cbpp; bpp++) { + if ((*bpp)->b_blkno != blkno) { + if ((*bpp)->b_vp == devvp) { + printf("Oops, would misplace raw block " + "0x%" PRIx64 " at 0x%" PRIx64 "\n", + (*bpp)->b_blkno, + blkno); + } else { + printf("%s:%d: misplace ino %llu lbn %" PRId64 + " at 0x%" PRIx64 " instead of " + "0x%" PRIx64 "\n", + file, line, + (unsigned long long) + VTOI((*bpp)->b_vp)->i_number, + (*bpp)->b_lblkno, + blkno, + (*bpp)->b_blkno); + } + } + blkno += fsbtodb(fs, btofsb(fs, (*bpp)->b_bcount)); + } +} + +int lfs_debug_log_subsys[DLOG_MAX]; + +/* + * Log events from various debugging areas of LFS, depending on what + * the user has enabled. + */ +void +lfs_debug_log(int subsys, const char *fmt, ...) 
+{ + va_list ap; + + /* If not debugging this subsys, exit */ + if (lfs_debug_log_subsys[subsys] == 0) + return; + + va_start(ap, fmt); + vlog(LOG_DEBUG, fmt, ap); + va_end(ap); +} +#endif /* DEBUG */ diff --git a/include/ufs/lfs/lfs_extern.h b/sys/ufs/lfs/lfs_extern.h similarity index 100% rename from include/ufs/lfs/lfs_extern.h rename to sys/ufs/lfs/lfs_extern.h diff --git a/sys/ufs/lfs/lfs_inode.c b/sys/ufs/lfs/lfs_inode.c new file mode 100644 index 000000000..06bb9c193 --- /dev/null +++ b/sys/ufs/lfs/lfs_inode.c @@ -0,0 +1,902 @@ +/* $NetBSD: lfs_inode.c,v 1.126 2011/11/23 19:42:10 bouyer Exp $ */ + +/*- + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Konrad E. Schroder . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/* + * Copyright (c) 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_inode.c 8.9 (Berkeley) 5/8/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: lfs_inode.c,v 1.126 2011/11/23 19:42:10 bouyer Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_quota.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +static int lfs_update_seguse(struct lfs *, struct inode *ip, long, size_t); +static int lfs_indirtrunc (struct inode *, daddr_t, daddr_t, + daddr_t, int, long *, long *, long *, size_t *); +static int lfs_blkfree (struct lfs *, struct inode *, daddr_t, size_t, long *, size_t *); +static int lfs_vtruncbuf(struct vnode *, daddr_t, bool, int); + +/* Search a block for a specific dinode. */ +struct ufs1_dinode * +lfs_ifind(struct lfs *fs, ino_t ino, struct buf *bp) +{ + struct ufs1_dinode *dip = (struct ufs1_dinode *)bp->b_data; + struct ufs1_dinode *ldip, *fin; + + ASSERT_NO_SEGLOCK(fs); + /* + * Read the inode block backwards, since later versions of the + * inode will supercede earlier ones. Though it is unlikely, it is + * possible that the same inode will appear in the same inode block. + */ + fin = dip + INOPB(fs); + for (ldip = fin - 1; ldip >= dip; --ldip) + if (ldip->di_inumber == ino) + return (ldip); + + printf("searched %d entries\n", (int)(fin - dip)); + printf("offset is 0x%x (seg %d)\n", fs->lfs_offset, + dtosn(fs, fs->lfs_offset)); + printf("block is 0x%llx (seg %lld)\n", + (unsigned long long)dbtofsb(fs, bp->b_blkno), + (long long)dtosn(fs, dbtofsb(fs, bp->b_blkno))); + + return NULL; +} + +int +lfs_update(struct vnode *vp, const struct timespec *acc, + const struct timespec *mod, int updflags) +{ + struct inode *ip; + struct lfs *fs = VFSTOUFS(vp->v_mount)->um_lfs; + int flags; + + ASSERT_NO_SEGLOCK(fs); + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (0); + ip = VTOI(vp); + + /* + * If we are called from vinvalbuf, and the file's blocks have + * already been scheduled for writing, but the writes have not + * yet completed, lfs_vflush will not be called, and vinvalbuf + * will cause a panic. So, we must wait until any pending write + * for our inode completes, if we are called with UPDATE_WAIT set. + */ + mutex_enter(vp->v_interlock); + while ((updflags & (UPDATE_WAIT|UPDATE_DIROP)) == UPDATE_WAIT && + WRITEINPROG(vp)) { + DLOG((DLOG_SEG, "lfs_update: sleeping on ino %d" + " (in progress)\n", ip->i_number)); + cv_wait(&vp->v_cv, vp->v_interlock); + } + mutex_exit(vp->v_interlock); + LFS_ITIMES(ip, acc, mod, NULL); + if (updflags & UPDATE_CLOSE) + flags = ip->i_flag & (IN_MODIFIED | IN_ACCESSED | IN_CLEANING); + else + flags = ip->i_flag & (IN_MODIFIED | IN_CLEANING); + if (flags == 0) + return (0); + + /* If sync, push back the vnode and any dirty blocks it may have. */ + if ((updflags & (UPDATE_WAIT|UPDATE_DIROP)) == UPDATE_WAIT) { + /* Avoid flushing VU_DIROP. 
*/ + mutex_enter(&lfs_lock); + ++fs->lfs_diropwait; + while (vp->v_uflag & VU_DIROP) { + DLOG((DLOG_DIROP, "lfs_update: sleeping on inode %d" + " (dirops)\n", ip->i_number)); + DLOG((DLOG_DIROP, "lfs_update: vflags 0x%x, iflags" + " 0x%x\n", + vp->v_iflag | vp->v_vflag | vp->v_uflag, + ip->i_flag)); + if (fs->lfs_dirops == 0) + lfs_flush_fs(fs, SEGM_SYNC); + else + mtsleep(&fs->lfs_writer, PRIBIO+1, "lfs_fsync", + 0, &lfs_lock); + /* XXX KS - by falling out here, are we writing the vn + twice? */ + } + --fs->lfs_diropwait; + mutex_exit(&lfs_lock); + return lfs_vflush(vp); + } + return 0; +} + +#define SINGLE 0 /* index of single indirect block */ +#define DOUBLE 1 /* index of double indirect block */ +#define TRIPLE 2 /* index of triple indirect block */ +/* + * Truncate the inode oip to at most length size, freeing the + * disk blocks. + */ +/* VOP_BWRITE 1 + NIADDR + lfs_balloc == 2 + 2*NIADDR times */ + +int +lfs_truncate(struct vnode *ovp, off_t length, int ioflag, kauth_cred_t cred) +{ + daddr_t lastblock; + struct inode *oip = VTOI(ovp); + daddr_t bn, lbn, lastiblock[NIADDR], indir_lbn[NIADDR]; + /* XXX ondisk32 */ + int32_t newblks[NDADDR + NIADDR]; + struct lfs *fs; + struct buf *bp; + int offset, size, level; + long count, rcount, blocksreleased = 0, real_released = 0; + int i, nblocks; + int aflags, error, allerror = 0; + off_t osize; + long lastseg; + size_t bc; + int obufsize, odb; + int usepc; + struct ufsmount *ump = oip->i_ump; + + if (ovp->v_type == VCHR || ovp->v_type == VBLK || + ovp->v_type == VFIFO || ovp->v_type == VSOCK) { + KASSERT(oip->i_size == 0); + return 0; + } + + if (length < 0) + return (EINVAL); + + /* + * Just return and not update modification times. + */ + if (oip->i_size == length) { + /* still do a uvm_vnp_setsize() as writesize may be larger */ + uvm_vnp_setsize(ovp, length); + return (0); + } + + if (ovp->v_type == VLNK && + (oip->i_size < ump->um_maxsymlinklen || + (ump->um_maxsymlinklen == 0 && + oip->i_ffs1_blocks == 0))) { +#ifdef DIAGNOSTIC + if (length != 0) + panic("lfs_truncate: partial truncate of symlink"); +#endif + memset((char *)SHORTLINK(oip), 0, (u_int)oip->i_size); + oip->i_size = oip->i_ffs1_size = 0; + oip->i_flag |= IN_CHANGE | IN_UPDATE; + return (lfs_update(ovp, NULL, NULL, 0)); + } + if (oip->i_size == length) { + oip->i_flag |= IN_CHANGE | IN_UPDATE; + return (lfs_update(ovp, NULL, NULL, 0)); + } + fs = oip->i_lfs; + lfs_imtime(fs); + osize = oip->i_size; + usepc = (ovp->v_type == VREG && ovp != fs->lfs_ivnode); + + ASSERT_NO_SEGLOCK(fs); + /* + * Lengthen the size of the file. We must ensure that the + * last byte of the file is allocated. Since the smallest + * value of osize is 0, length will be at least 1. 
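When lengthening, the code that follows ultimately asks lfs_balloc() for the one byte at offset length - 1, which is enough to get the block holding the new end of file allocated and zero-filled. A minimal userland sketch of the same trick against an ordinary POSIX file; the file name and target size here are invented for illustration:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <unistd.h>

int
main(void)
{
	const char *path = "extend_demo.dat";	/* scratch file, name is arbitrary */
	off_t length = 3 * 4096 + 17;		/* arbitrary new size */
	struct stat st;
	int fd;

	if ((fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0644)) == -1) {
		perror("open");
		return EXIT_FAILURE;
	}
	/* Write a single zero byte at length - 1: the filesystem must now
	 * allocate (or at least account for) the block containing EOF,
	 * much as lfs_truncate() does via lfs_balloc(ovp, length - 1, 1, ...). */
	if (pwrite(fd, "", 1, length - 1) != 1) {
		perror("pwrite");
		return EXIT_FAILURE;
	}
	if (fstat(fd, &st) == 0)
		printf("size %lld, %lld 512-byte blocks in use\n",
		    (long long)st.st_size, (long long)st.st_blocks);
	close(fd);
	return EXIT_SUCCESS;
}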
+ */ + if (osize < length) { + if (length > ump->um_maxfilesize) + return (EFBIG); + aflags = B_CLRBUF; + if (ioflag & IO_SYNC) + aflags |= B_SYNC; + if (usepc) { + if (lblkno(fs, osize) < NDADDR && + lblkno(fs, osize) != lblkno(fs, length) && + blkroundup(fs, osize) != osize) { + off_t eob; + + eob = blkroundup(fs, osize); + uvm_vnp_setwritesize(ovp, eob); + error = ufs_balloc_range(ovp, osize, + eob - osize, cred, aflags); + if (error) { + (void) lfs_truncate(ovp, osize, + ioflag & IO_SYNC, cred); + return error; + } + if (ioflag & IO_SYNC) { + mutex_enter(ovp->v_interlock); + VOP_PUTPAGES(ovp, + trunc_page(osize & fs->lfs_bmask), + round_page(eob), + PGO_CLEANIT | PGO_SYNCIO); + } + } + uvm_vnp_setwritesize(ovp, length); + error = ufs_balloc_range(ovp, length - 1, 1, cred, + aflags); + if (error) { + (void) lfs_truncate(ovp, osize, + ioflag & IO_SYNC, cred); + return error; + } + uvm_vnp_setsize(ovp, length); + oip->i_flag |= IN_CHANGE | IN_UPDATE; + KASSERT(ovp->v_size == oip->i_size); + oip->i_lfs_hiblk = lblkno(fs, oip->i_size + fs->lfs_bsize - 1) - 1; + return (lfs_update(ovp, NULL, NULL, 0)); + } else { + error = lfs_reserve(fs, ovp, NULL, + btofsb(fs, (NIADDR + 2) << fs->lfs_bshift)); + if (error) + return (error); + error = lfs_balloc(ovp, length - 1, 1, cred, + aflags, &bp); + lfs_reserve(fs, ovp, NULL, + -btofsb(fs, (NIADDR + 2) << fs->lfs_bshift)); + if (error) + return (error); + oip->i_ffs1_size = oip->i_size = length; + uvm_vnp_setsize(ovp, length); + (void) VOP_BWRITE(bp->b_vp, bp); + oip->i_flag |= IN_CHANGE | IN_UPDATE; + oip->i_lfs_hiblk = lblkno(fs, oip->i_size + fs->lfs_bsize - 1) - 1; + return (lfs_update(ovp, NULL, NULL, 0)); + } + } + + if ((error = lfs_reserve(fs, ovp, NULL, + btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift))) != 0) + return (error); + + /* + * Shorten the size of the file. If the file is not being + * truncated to a block boundary, the contents of the + * partial block following the end of the file must be + * zero'ed in case it ever becomes accessible again because + * of subsequent file growth. Directories however are not + * zero'ed as they should grow back initialized to empty. + */ + offset = blkoff(fs, length); + lastseg = -1; + bc = 0; + + if (ovp != fs->lfs_ivnode) + lfs_seglock(fs, SEGM_PROT); + if (offset == 0) { + oip->i_size = oip->i_ffs1_size = length; + } else if (!usepc) { + lbn = lblkno(fs, length); + aflags = B_CLRBUF; + if (ioflag & IO_SYNC) + aflags |= B_SYNC; + error = lfs_balloc(ovp, length - 1, 1, cred, aflags, &bp); + if (error) { + lfs_reserve(fs, ovp, NULL, + -btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift)); + goto errout; + } + obufsize = bp->b_bufsize; + odb = btofsb(fs, bp->b_bcount); + oip->i_size = oip->i_ffs1_size = length; + size = blksize(fs, oip, lbn); + if (ovp->v_type != VDIR) + memset((char *)bp->b_data + offset, 0, + (u_int)(size - offset)); + allocbuf(bp, size, 1); + if ((bp->b_flags & B_LOCKED) != 0 && bp->b_iodone == NULL) { + mutex_enter(&lfs_lock); + locked_queue_bytes -= obufsize - bp->b_bufsize; + mutex_exit(&lfs_lock); + } + if (bp->b_oflags & BO_DELWRI) + fs->lfs_avail += odb - btofsb(fs, size); + (void) VOP_BWRITE(bp->b_vp, bp); + } else { /* vp->v_type == VREG && length < osize && offset != 0 */ + /* + * When truncating a regular file down to a non-block-aligned + * size, we must zero the part of last block which is past + * the new EOF. We must synchronously flush the zeroed pages + * to disk since the new pages will be invalidated as soon + * as we inform the VM system of the new, smaller size. 
+ * We must do this before acquiring the GLOCK, since fetching + * the pages will acquire the GLOCK internally. + * So there is a window where another thread could see a whole + * zeroed page past EOF, but that's life. + */ + daddr_t xlbn; + voff_t eoz; + + aflags = ioflag & IO_SYNC ? B_SYNC : 0; + error = ufs_balloc_range(ovp, length - 1, 1, cred, aflags); + if (error) { + lfs_reserve(fs, ovp, NULL, + -btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift)); + goto errout; + } + xlbn = lblkno(fs, length); + size = blksize(fs, oip, xlbn); + eoz = MIN(lblktosize(fs, xlbn) + size, osize); + ubc_zerorange(&ovp->v_uobj, length, eoz - length, + UBC_UNMAP_FLAG(ovp)); + if (round_page(eoz) > round_page(length)) { + mutex_enter(ovp->v_interlock); + error = VOP_PUTPAGES(ovp, round_page(length), + round_page(eoz), + PGO_CLEANIT | PGO_DEACTIVATE | + ((ioflag & IO_SYNC) ? PGO_SYNCIO : 0)); + if (error) { + lfs_reserve(fs, ovp, NULL, + -btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift)); + goto errout; + } + } + } + + genfs_node_wrlock(ovp); + + oip->i_size = oip->i_ffs1_size = length; + uvm_vnp_setsize(ovp, length); + + /* + * Calculate index into inode's block list of + * last direct and indirect blocks (if any) + * which we want to keep. Lastblock is -1 when + * the file is truncated to 0. + */ + /* Avoid sign overflow - XXX assumes that off_t is a quad_t. */ + if (length > QUAD_MAX - fs->lfs_bsize) + lastblock = lblkno(fs, QUAD_MAX - fs->lfs_bsize); + else + lastblock = lblkno(fs, length + fs->lfs_bsize - 1) - 1; + lastiblock[SINGLE] = lastblock - NDADDR; + lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs); + lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs); + nblocks = btofsb(fs, fs->lfs_bsize); + /* + * Record changed file and block pointers before we start + * freeing blocks. lastiblock values are also normalized to -1 + * for calls to lfs_indirtrunc below. + */ + memcpy((void *)newblks, (void *)&oip->i_ffs1_db[0], sizeof newblks); + for (level = TRIPLE; level >= SINGLE; level--) + if (lastiblock[level] < 0) { + newblks[NDADDR+level] = 0; + lastiblock[level] = -1; + } + for (i = NDADDR - 1; i > lastblock; i--) + newblks[i] = 0; + + oip->i_size = oip->i_ffs1_size = osize; + error = lfs_vtruncbuf(ovp, lastblock + 1, false, 0); + if (error && !allerror) + allerror = error; + + /* + * Indirect blocks first. + */ + indir_lbn[SINGLE] = -NDADDR; + indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) - 1; + indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1; + for (level = TRIPLE; level >= SINGLE; level--) { + bn = oip->i_ffs1_ib[level]; + if (bn != 0) { + error = lfs_indirtrunc(oip, indir_lbn[level], + bn, lastiblock[level], + level, &count, &rcount, + &lastseg, &bc); + if (error) + allerror = error; + real_released += rcount; + blocksreleased += count; + if (lastiblock[level] < 0) { + if (oip->i_ffs1_ib[level] > 0) + real_released += nblocks; + blocksreleased += nblocks; + oip->i_ffs1_ib[level] = 0; + lfs_blkfree(fs, oip, bn, fs->lfs_bsize, + &lastseg, &bc); + lfs_deregister_block(ovp, bn); + } + } + if (lastiblock[level] >= 0) + goto done; + } + + /* + * All whole direct blocks or frags. 
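The lastblock/lastiblock computation above decides how many levels of indirection survive the truncation; a negative lastiblock[] entry means that whole level is released. A standalone re-run of the arithmetic with assumed geometry (8 KB blocks, 12 direct pointers, 2048 32-bit pointers per indirect block); none of these constants are read from a real superblock:

#include <stdint.h>
#include <stdio.h>

#define BSIZE	8192				/* assumed block size */
#define BSHIFT	13				/* log2(BSIZE) */
#define NDADDR	12				/* direct pointers in the dinode */
#define NINDIR	(BSIZE / (int)sizeof(int32_t))	/* pointers per indirect block */

int
main(void)
{
	int64_t length = 300 * (int64_t)BSIZE;	/* truncate to 300 blocks */
	int64_t lastblock, single, dbl, triple;

	/* Last logical block kept; becomes -1 when truncating to zero. */
	lastblock = ((length + BSIZE - 1) >> BSHIFT) - 1;
	single = lastblock - NDADDR;
	dbl    = single - NINDIR;
	triple = dbl - (int64_t)NINDIR * NINDIR;

	printf("lastblock          = %lld\n", (long long)lastblock);
	printf("lastiblock[SINGLE] = %lld\n", (long long)single);
	printf("lastiblock[DOUBLE] = %lld\n", (long long)dbl);
	printf("lastiblock[TRIPLE] = %lld\n", (long long)triple);
	return 0;
}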
+ */ + for (i = NDADDR - 1; i > lastblock; i--) { + long bsize, obsize; + + bn = oip->i_ffs1_db[i]; + if (bn == 0) + continue; + bsize = blksize(fs, oip, i); + if (oip->i_ffs1_db[i] > 0) { + /* Check for fragment size changes */ + obsize = oip->i_lfs_fragsize[i]; + real_released += btofsb(fs, obsize); + oip->i_lfs_fragsize[i] = 0; + } else + obsize = 0; + blocksreleased += btofsb(fs, bsize); + oip->i_ffs1_db[i] = 0; + lfs_blkfree(fs, oip, bn, obsize, &lastseg, &bc); + lfs_deregister_block(ovp, bn); + } + if (lastblock < 0) + goto done; + + /* + * Finally, look for a change in size of the + * last direct block; release any frags. + */ + bn = oip->i_ffs1_db[lastblock]; + if (bn != 0) { + long oldspace, newspace; +#if 0 + long olddspace; +#endif + + /* + * Calculate amount of space we're giving + * back as old block size minus new block size. + */ + oldspace = blksize(fs, oip, lastblock); +#if 0 + olddspace = oip->i_lfs_fragsize[lastblock]; +#endif + + oip->i_size = oip->i_ffs1_size = length; + newspace = blksize(fs, oip, lastblock); + if (newspace == 0) + panic("itrunc: newspace"); + if (oldspace - newspace > 0) { + blocksreleased += btofsb(fs, oldspace - newspace); + } +#if 0 + if (bn > 0 && olddspace - newspace > 0) { + /* No segment accounting here, just vnode */ + real_released += btofsb(fs, olddspace - newspace); + } +#endif + } + +done: + /* Finish segment accounting corrections */ + lfs_update_seguse(fs, oip, lastseg, bc); +#ifdef DIAGNOSTIC + for (level = SINGLE; level <= TRIPLE; level++) + if ((newblks[NDADDR + level] == 0) != + ((oip->i_ffs1_ib[level]) == 0)) { + panic("lfs itrunc1"); + } + for (i = 0; i < NDADDR; i++) + if ((newblks[i] == 0) != (oip->i_ffs1_db[i] == 0)) { + panic("lfs itrunc2"); + } + if (length == 0 && + (!LIST_EMPTY(&ovp->v_cleanblkhd) || !LIST_EMPTY(&ovp->v_dirtyblkhd))) + panic("lfs itrunc3"); +#endif /* DIAGNOSTIC */ + /* + * Put back the real size. + */ + oip->i_size = oip->i_ffs1_size = length; + oip->i_lfs_effnblks -= blocksreleased; + oip->i_ffs1_blocks -= real_released; + mutex_enter(&lfs_lock); + fs->lfs_bfree += blocksreleased; + mutex_exit(&lfs_lock); +#ifdef DIAGNOSTIC + if (oip->i_size == 0 && + (oip->i_ffs1_blocks != 0 || oip->i_lfs_effnblks != 0)) { + printf("lfs_truncate: truncate to 0 but %d blks/%d effblks\n", + oip->i_ffs1_blocks, oip->i_lfs_effnblks); + panic("lfs_truncate: persistent blocks"); + } +#endif + + /* + * If we truncated to zero, take us off the paging queue. + */ + mutex_enter(&lfs_lock); + if (oip->i_size == 0 && oip->i_flags & IN_PAGING) { + oip->i_flags &= ~IN_PAGING; + TAILQ_REMOVE(&fs->lfs_pchainhd, oip, i_lfs_pchain); + } + mutex_exit(&lfs_lock); + + oip->i_flag |= IN_CHANGE; +#ifdef QUOTA + (void) chkdq(oip, -blocksreleased, NOCRED, 0); +#endif + lfs_reserve(fs, ovp, NULL, + -btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift)); + genfs_node_unlock(ovp); + errout: + oip->i_lfs_hiblk = lblkno(fs, oip->i_size + fs->lfs_bsize - 1) - 1; + if (ovp != fs->lfs_ivnode) + lfs_segunlock(fs); + return (allerror ? allerror : error); +} + +/* Update segment and avail usage information when removing a block. 
*/ +static int +lfs_blkfree(struct lfs *fs, struct inode *ip, daddr_t daddr, + size_t bsize, long *lastseg, size_t *num) +{ + long seg; + int error = 0; + + ASSERT_SEGLOCK(fs); + bsize = fragroundup(fs, bsize); + if (daddr > 0) { + if (*lastseg != (seg = dtosn(fs, daddr))) { + error = lfs_update_seguse(fs, ip, *lastseg, *num); + *num = bsize; + *lastseg = seg; + } else + *num += bsize; + } + + return error; +} + +/* Finish the accounting updates for a segment. */ +static int +lfs_update_seguse(struct lfs *fs, struct inode *ip, long lastseg, size_t num) +{ + struct segdelta *sd; + struct vnode *vp; + + ASSERT_SEGLOCK(fs); + if (lastseg < 0 || num == 0) + return 0; + + vp = ITOV(ip); + LIST_FOREACH(sd, &ip->i_lfs_segdhd, list) + if (sd->segnum == lastseg) + break; + if (sd == NULL) { + sd = malloc(sizeof(*sd), M_SEGMENT, M_WAITOK); + sd->segnum = lastseg; + sd->num = 0; + LIST_INSERT_HEAD(&ip->i_lfs_segdhd, sd, list); + } + sd->num += num; + + return 0; +} + +static void +lfs_finalize_seguse(struct lfs *fs, void *v) +{ + SEGUSE *sup; + struct buf *bp; + struct segdelta *sd; + LIST_HEAD(, segdelta) *hd = v; + + ASSERT_SEGLOCK(fs); + while((sd = LIST_FIRST(hd)) != NULL) { + LIST_REMOVE(sd, list); + LFS_SEGENTRY(sup, fs, sd->segnum, bp); + if (sd->num > sup->su_nbytes) { + printf("lfs_finalize_seguse: segment %ld short by %ld\n", + sd->segnum, (long)(sd->num - sup->su_nbytes)); + panic("lfs_finalize_seguse: negative bytes"); + sup->su_nbytes = sd->num; + } + sup->su_nbytes -= sd->num; + LFS_WRITESEGENTRY(sup, fs, sd->segnum, bp); + free(sd, M_SEGMENT); + } +} + +/* Finish the accounting updates for a segment. */ +void +lfs_finalize_ino_seguse(struct lfs *fs, struct inode *ip) +{ + ASSERT_SEGLOCK(fs); + lfs_finalize_seguse(fs, &ip->i_lfs_segdhd); +} + +/* Finish the accounting updates for a segment. */ +void +lfs_finalize_fs_seguse(struct lfs *fs) +{ + ASSERT_SEGLOCK(fs); + lfs_finalize_seguse(fs, &fs->lfs_segdhd); +} + +/* + * Release blocks associated with the inode ip and stored in the indirect + * block bn. Blocks are free'd in LIFO order up to (but not including) + * lastbn. If level is greater than SINGLE, the block is an indirect block + * and recursive calls to indirtrunc must be used to cleanse other indirect + * blocks. + * + * NB: triple indirect blocks are untested. + */ +static int +lfs_indirtrunc(struct inode *ip, daddr_t lbn, daddr_t dbn, + daddr_t lastbn, int level, long *countp, + long *rcountp, long *lastsegp, size_t *bcp) +{ + int i; + struct buf *bp; + struct lfs *fs = ip->i_lfs; + int32_t *bap; /* XXX ondisk32 */ + struct vnode *vp; + daddr_t nb, nlbn, last; + int32_t *copy = NULL; /* XXX ondisk32 */ + long blkcount, rblkcount, factor; + int nblocks, blocksreleased = 0, real_released = 0; + int error = 0, allerror = 0; + + ASSERT_SEGLOCK(fs); + /* + * Calculate index in current block of last + * block to be kept. -1 indicates the entire + * block so we need not calculate the index. + */ + factor = 1; + for (i = SINGLE; i < level; i++) + factor *= NINDIR(fs); + last = lastbn; + if (lastbn > 0) + last /= factor; + nblocks = btofsb(fs, fs->lfs_bsize); + /* + * Get buffer of block pointers, zero those entries corresponding + * to blocks to be free'd, and update on disk copy first. Since + * double(triple) indirect before single(double) indirect, calls + * to bmap on these blocks will fail. However, we already have + * the on disk address, so we have to set the b_blkno field + * explicitly instead of letting bread do everything for us. 
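Earlier in this file, lfs_blkfree() batches the byte counts of freed blocks per segment, flushing the accumulated total through lfs_update_seguse() only when the segment number changes. A small host-side sketch of that batching pattern; the segment geometry and the list of freed blocks are invented:

#include <stdio.h>

#define SEG_BYTES (1024 * 1024)	/* assumed bytes per segment */

/* Simplified dtosn(): map a byte address to a segment number. */
static long
dtosn(long long daddr)
{
	return (long)(daddr / SEG_BYTES);
}

static void
update_seguse(long seg, size_t num)
{
	if (seg < 0 || num == 0)
		return;
	printf("segment %ld: subtract %zu bytes\n", seg, num);
}

int
main(void)
{
	/* (address, size) pairs of blocks being freed, in truncation order. */
	static const struct { long long daddr; size_t size; } frees[] = {
		{ 1048576 + 0,     8192 },
		{ 1048576 + 8192,  8192 },
		{ 2097152 + 4096,  8192 },
		{ 2097152 + 12288, 4096 },
	};
	long lastseg = -1;
	size_t num = 0;
	size_t i;

	for (i = 0; i < sizeof(frees) / sizeof(frees[0]); i++) {
		long seg = dtosn(frees[i].daddr);

		if (seg != lastseg) {
			update_seguse(lastseg, num);	/* flush the previous run */
			lastseg = seg;
			num = frees[i].size;
		} else
			num += frees[i].size;
	}
	update_seguse(lastseg, num);	/* final flush */
	return 0;
}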
+ */ + vp = ITOV(ip); + bp = getblk(vp, lbn, (int)fs->lfs_bsize, 0, 0); + if (bp->b_oflags & (BO_DONE | BO_DELWRI)) { + /* Braces must be here in case trace evaluates to nothing. */ + trace(TR_BREADHIT, pack(vp, fs->lfs_bsize), lbn); + } else { + trace(TR_BREADMISS, pack(vp, fs->lfs_bsize), lbn); + curlwp->l_ru.ru_inblock++; /* pay for read */ + bp->b_flags |= B_READ; + if (bp->b_bcount > bp->b_bufsize) + panic("lfs_indirtrunc: bad buffer size"); + bp->b_blkno = fsbtodb(fs, dbn); + VOP_STRATEGY(vp, bp); + error = biowait(bp); + } + if (error) { + brelse(bp, 0); + *countp = *rcountp = 0; + return (error); + } + + bap = (int32_t *)bp->b_data; /* XXX ondisk32 */ + if (lastbn >= 0) { + copy = (int32_t *)lfs_malloc(fs, fs->lfs_bsize, LFS_NB_IBLOCK); + memcpy((void *)copy, (void *)bap, (u_int)fs->lfs_bsize); + memset((void *)&bap[last + 1], 0, + /* XXX ondisk32 */ + (u_int)(NINDIR(fs) - (last + 1)) * sizeof (int32_t)); + error = VOP_BWRITE(bp->b_vp, bp); + if (error) + allerror = error; + bap = copy; + } + + /* + * Recursively free totally unused blocks. + */ + for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last; + i--, nlbn += factor) { + nb = bap[i]; + if (nb == 0) + continue; + if (level > SINGLE) { + error = lfs_indirtrunc(ip, nlbn, nb, + (daddr_t)-1, level - 1, + &blkcount, &rblkcount, + lastsegp, bcp); + if (error) + allerror = error; + blocksreleased += blkcount; + real_released += rblkcount; + } + lfs_blkfree(fs, ip, nb, fs->lfs_bsize, lastsegp, bcp); + if (bap[i] > 0) + real_released += nblocks; + blocksreleased += nblocks; + } + + /* + * Recursively free last partial block. + */ + if (level > SINGLE && lastbn >= 0) { + last = lastbn % factor; + nb = bap[i]; + if (nb != 0) { + error = lfs_indirtrunc(ip, nlbn, nb, + last, level - 1, &blkcount, + &rblkcount, lastsegp, bcp); + if (error) + allerror = error; + real_released += rblkcount; + blocksreleased += blkcount; + } + } + + if (copy != NULL) { + lfs_free(fs, copy, LFS_NB_IBLOCK); + } else { + mutex_enter(&bufcache_lock); + if (bp->b_oflags & BO_DELWRI) { + LFS_UNLOCK_BUF(bp); + fs->lfs_avail += btofsb(fs, bp->b_bcount); + wakeup(&fs->lfs_avail); + } + brelsel(bp, BC_INVAL); + mutex_exit(&bufcache_lock); + } + + *countp = blocksreleased; + *rcountp = real_released; + return (allerror); +} + +/* + * Destroy any in core blocks past the truncation length. + * Inlined from vtruncbuf, so that lfs_avail could be updated. + * We take the seglock to prevent cleaning from occurring while we are + * invalidating blocks. 
+ */ +static int +lfs_vtruncbuf(struct vnode *vp, daddr_t lbn, bool catch, int slptimeo) +{ + struct buf *bp, *nbp; + int error; + struct lfs *fs; + voff_t off; + + off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift); + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO); + if (error) + return error; + + fs = VTOI(vp)->i_lfs; + + ASSERT_SEGLOCK(fs); + + mutex_enter(&bufcache_lock); +restart: + for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { + nbp = LIST_NEXT(bp, b_vnbufs); + if (bp->b_lblkno < lbn) + continue; + error = bbusy(bp, catch, slptimeo, NULL); + if (error == EPASSTHROUGH) + goto restart; + if (error != 0) { + mutex_exit(&bufcache_lock); + return (error); + } + mutex_enter(bp->b_objlock); + if (bp->b_oflags & BO_DELWRI) { + bp->b_oflags &= ~BO_DELWRI; + fs->lfs_avail += btofsb(fs, bp->b_bcount); + wakeup(&fs->lfs_avail); + } + mutex_exit(bp->b_objlock); + LFS_UNLOCK_BUF(bp); + brelsel(bp, BC_INVAL | BC_VFLUSH); + } + + for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = LIST_NEXT(bp, b_vnbufs); + if (bp->b_lblkno < lbn) + continue; + error = bbusy(bp, catch, slptimeo, NULL); + if (error == EPASSTHROUGH) + goto restart; + if (error != 0) { + mutex_exit(&bufcache_lock); + return (error); + } + mutex_enter(bp->b_objlock); + if (bp->b_oflags & BO_DELWRI) { + bp->b_oflags &= ~BO_DELWRI; + fs->lfs_avail += btofsb(fs, bp->b_bcount); + wakeup(&fs->lfs_avail); + } + mutex_exit(bp->b_objlock); + LFS_UNLOCK_BUF(bp); + brelsel(bp, BC_INVAL | BC_VFLUSH); + } + mutex_exit(&bufcache_lock); + + return (0); +} + diff --git a/sys/ufs/lfs/lfs_itimes.c b/sys/ufs/lfs/lfs_itimes.c new file mode 100644 index 000000000..3ef9f86c4 --- /dev/null +++ b/sys/ufs/lfs/lfs_itimes.c @@ -0,0 +1,118 @@ +/* $NetBSD: lfs_itimes.c,v 1.12 2008/04/28 20:24:11 martin Exp $ */ + +/*- + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Konrad E. Schroder . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include +__KERNEL_RCSID(0, "$NetBSD: lfs_itimes.c,v 1.12 2008/04/28 20:24:11 martin Exp $"); + +#include +#include +#include +#include + +#include + +#ifndef _KERNEL +#include "bufcache.h" +#include "vnode.h" +#include "lfs_user.h" +#define vnode uvnode +#define buf ubuf +#define panic call_panic +#else +#include +#include +#endif + +#include + +void +lfs_itimes(struct inode *ip, const struct timespec *acc, + const struct timespec *mod, const struct timespec *cre) +{ +#ifdef _KERNEL + struct timespec now; + + KASSERT(ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY)); + + vfs_timestamp(&now); +#endif + + if (ip->i_flag & IN_ACCESS) { +#ifdef _KERNEL + if (acc == NULL) + acc = &now; +#endif + ip->i_ffs1_atime = acc->tv_sec; + ip->i_ffs1_atimensec = acc->tv_nsec; + if (ip->i_lfs->lfs_version > 1) { + struct lfs *fs = ip->i_lfs; + struct buf *ibp; + IFILE *ifp; + + LFS_IENTRY(ifp, ip->i_lfs, ip->i_number, ibp); + ifp->if_atime_sec = acc->tv_sec; + ifp->if_atime_nsec = acc->tv_nsec; + LFS_BWRITE_LOG(ibp); + mutex_enter(&lfs_lock); + fs->lfs_flags |= LFS_IFDIRTY; + mutex_exit(&lfs_lock); + } else { + mutex_enter(&lfs_lock); + LFS_SET_UINO(ip, IN_ACCESSED); + mutex_exit(&lfs_lock); + } + } + if (ip->i_flag & (IN_CHANGE | IN_UPDATE | IN_MODIFY)) { + if (ip->i_flag & (IN_UPDATE | IN_MODIFY)) { +#ifdef _KERNEL + if (mod == NULL) + mod = &now; +#endif + ip->i_ffs1_mtime = mod->tv_sec; + ip->i_ffs1_mtimensec = mod->tv_nsec; + ip->i_modrev++; + } + if (ip->i_flag & (IN_CHANGE | IN_MODIFY)) { +#ifdef _KERNEL + if (cre == NULL) + cre = &now; +#endif + ip->i_ffs1_ctime = cre->tv_sec; + ip->i_ffs1_ctimensec = cre->tv_nsec; + } + mutex_enter(&lfs_lock); + if (ip->i_flag & (IN_CHANGE | IN_UPDATE)) + LFS_SET_UINO(ip, IN_MODIFIED); + if (ip->i_flag & IN_MODIFY) + LFS_SET_UINO(ip, IN_ACCESSED); + mutex_exit(&lfs_lock); + } + ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY); +} diff --git a/sys/ufs/lfs/lfs_rfw.c b/sys/ufs/lfs/lfs_rfw.c new file mode 100644 index 000000000..60d926ee4 --- /dev/null +++ b/sys/ufs/lfs/lfs_rfw.c @@ -0,0 +1,702 @@ +/* $NetBSD: lfs_rfw.c,v 1.12 2009/02/22 20:28:07 ad Exp $ */ + +/*- + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Konrad E. Schroder . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: lfs_rfw.c,v 1.12 2009/02/22 20:28:07 ad Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_quota.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +/* + * Roll-forward code. + */ +static daddr_t check_segsum(struct lfs *, daddr_t, u_int64_t, + kauth_cred_t, int, int *, struct lwp *); + +extern int lfs_do_rfw; + +/* + * Allocate a particular inode with a particular version number, freeing + * any previous versions of this inode that may have gone before. + * Used by the roll-forward code. + * + * XXX this function does not have appropriate locking to be used on a live fs; + * XXX but something similar could probably be used for an "undelete" call. + * + * Called with the Ifile inode locked. + */ +int +lfs_rf_valloc(struct lfs *fs, ino_t ino, int vers, struct lwp *l, + struct vnode **vpp) +{ + IFILE *ifp; + struct buf *bp, *cbp; + struct vnode *vp; + struct inode *ip; + ino_t tino, oldnext; + int error; + CLEANERINFO *cip; + + ASSERT_SEGLOCK(fs); /* XXX it doesn't, really */ + + /* + * First, just try a vget. If the version number is the one we want, + * we don't have to do anything else. If the version number is wrong, + * take appropriate action. + */ + error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, &vp); + if (error == 0) { + DLOG((DLOG_RF, "lfs_rf_valloc[1]: ino %d vp %p\n", ino, vp)); + + *vpp = vp; + ip = VTOI(vp); + if (ip->i_gen == vers) + return 0; + else if (ip->i_gen < vers) { + lfs_truncate(vp, (off_t)0, 0, NOCRED); + ip->i_gen = ip->i_ffs1_gen = vers; + LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE); + return 0; + } else { + DLOG((DLOG_RF, "ino %d: sought version %d, got %d\n", + ino, vers, ip->i_ffs1_gen)); + vput(vp); + *vpp = NULLVP; + return EEXIST; + } + } + + /* + * The inode is not in use. Find it on the free list. 
+ */ + /* If the Ifile is too short to contain this inum, extend it */ + while (VTOI(fs->lfs_ivnode)->i_size <= (ino / + fs->lfs_ifpb + fs->lfs_cleansz + fs->lfs_segtabsz) + << fs->lfs_bshift) { + lfs_extend_ifile(fs, NOCRED); + } + + LFS_IENTRY(ifp, fs, ino, bp); + oldnext = ifp->if_nextfree; + ifp->if_version = vers; + brelse(bp, 0); + + LFS_GET_HEADFREE(fs, cip, cbp, &ino); + if (ino) { + LFS_PUT_HEADFREE(fs, cip, cbp, oldnext); + } else { + tino = ino; + while (1) { + LFS_IENTRY(ifp, fs, tino, bp); + if (ifp->if_nextfree == ino || + ifp->if_nextfree == LFS_UNUSED_INUM) + break; + tino = ifp->if_nextfree; + brelse(bp, 0); + } + if (ifp->if_nextfree == LFS_UNUSED_INUM) { + brelse(bp, 0); + return ENOENT; + } + ifp->if_nextfree = oldnext; + LFS_BWRITE_LOG(bp); + } + + error = lfs_ialloc(fs, fs->lfs_ivnode, ino, vers, &vp); + if (error == 0) { + /* + * Make it VREG so we can put blocks on it. We will change + * this later if it turns out to be some other kind of file. + */ + ip = VTOI(vp); + ip->i_mode = ip->i_ffs1_mode = IFREG; + ip->i_nlink = ip->i_ffs1_nlink = 1; + ufs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p, &vp); + ip = VTOI(vp); + + DLOG((DLOG_RF, "lfs_rf_valloc: ino %d vp %p\n", ino, vp)); + + /* The dirop-nature of this vnode is past */ + lfs_unmark_vnode(vp); + (void)lfs_vunref(vp); + vp->v_uflag &= ~VU_DIROP; + mutex_enter(&lfs_lock); + --lfs_dirvcount; + --fs->lfs_dirvcount; + TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain); + wakeup(&lfs_dirvcount); + wakeup(&fs->lfs_dirvcount); + mutex_exit(&lfs_lock); + } + *vpp = vp; + return error; +} + +/* + * Load the appropriate indirect block, and change the appropriate pointer. + * Mark the block dirty. Do segment and avail accounting. + */ +static int +update_meta(struct lfs *fs, ino_t ino, int vers, daddr_t lbn, + daddr_t ndaddr, size_t size, struct lwp *l) +{ + int error; + struct vnode *vp; + struct inode *ip; +#ifdef DEBUG + daddr_t odaddr; + struct indir a[NIADDR]; + int num; + int i; +#endif /* DEBUG */ + struct buf *bp; + SEGUSE *sup; + + KASSERT(lbn >= 0); /* no indirect blocks */ + + if ((error = lfs_rf_valloc(fs, ino, vers, l, &vp)) != 0) { + DLOG((DLOG_RF, "update_meta: ino %d: lfs_rf_valloc" + " returned %d\n", ino, error)); + return error; + } + + if ((error = lfs_balloc(vp, (lbn << fs->lfs_bshift), size, + NOCRED, 0, &bp)) != 0) { + vput(vp); + return (error); + } + /* No need to write, the block is already on disk */ + if (bp->b_oflags & BO_DELWRI) { + LFS_UNLOCK_BUF(bp); + fs->lfs_avail += btofsb(fs, bp->b_bcount); + } + brelse(bp, BC_INVAL); + + /* + * Extend the file, if it is not large enough already. + * XXX this is not exactly right, we don't know how much of the + * XXX last block is actually used. We hope that an inode will + * XXX appear later to give the correct size. + */ + ip = VTOI(vp); + if (ip->i_size <= (lbn << fs->lfs_bshift)) { + u_int64_t newsize; + + if (lbn < NDADDR) + newsize = ip->i_ffs1_size = (lbn << fs->lfs_bshift) + + (size - fs->lfs_fsize) + 1; + else + newsize = ip->i_ffs1_size = (lbn << fs->lfs_bshift) + 1; + + if (ip->i_size < newsize) { + ip->i_size = newsize; + /* + * tell vm our new size for the case the inode won't + * appear later. + */ + uvm_vnp_setsize(vp, newsize); + } + } + + lfs_update_single(fs, NULL, vp, lbn, ndaddr, size); + + LFS_SEGENTRY(sup, fs, dtosn(fs, ndaddr), bp); + sup->su_nbytes += size; + LFS_WRITESEGENTRY(sup, fs, dtosn(fs, ndaddr), bp); + + /* differences here should be due to UNWRITTEN indirect blocks. 
*/ + KASSERT((lblkno(fs, ip->i_size) > NDADDR && + ip->i_lfs_effnblks == ip->i_ffs1_blocks) || + ip->i_lfs_effnblks >= ip->i_ffs1_blocks); + +#ifdef DEBUG + /* Now look again to make sure it worked */ + ufs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL); + for (i = num; i > 0; i--) { + if (!a[i].in_exists) + panic("update_meta: absent %d lv indirect block", i); + } + if (dbtofsb(fs, odaddr) != ndaddr) + DLOG((DLOG_RF, "update_meta: failed setting ino %d lbn %" + PRId64 " to %" PRId64 "\n", ino, lbn, ndaddr)); +#endif /* DEBUG */ + vput(vp); + return 0; +} + +static int +update_inoblk(struct lfs *fs, daddr_t offset, kauth_cred_t cred, + struct lwp *l) +{ + struct vnode *devvp, *vp; + struct inode *ip; + struct ufs1_dinode *dip; + struct buf *dbp, *ibp; + int error; + daddr_t daddr; + IFILE *ifp; + SEGUSE *sup; + + devvp = VTOI(fs->lfs_ivnode)->i_devvp; + + /* + * Get the inode, update times and perms. + * DO NOT update disk blocks, we do that separately. + */ + error = bread(devvp, fsbtodb(fs, offset), fs->lfs_ibsize, + cred, 0, &dbp); + if (error) { + DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error)); + return error; + } + dip = ((struct ufs1_dinode *)(dbp->b_data)) + INOPB(fs); + while (--dip >= (struct ufs1_dinode *)dbp->b_data) { + if (dip->di_inumber > LFS_IFILE_INUM) { + error = lfs_rf_valloc(fs, dip->di_inumber, dip->di_gen, + l, &vp); + if (error) { + DLOG((DLOG_RF, "update_inoblk: lfs_rf_valloc" + " returned %d\n", error)); + continue; + } + ip = VTOI(vp); + if (dip->di_size != ip->i_size) + lfs_truncate(vp, dip->di_size, 0, NOCRED); + /* Get mode, link count, size, and times */ + memcpy(ip->i_din.ffs1_din, dip, + offsetof(struct ufs1_dinode, di_db[0])); + + /* Then the rest, except di_blocks */ + ip->i_flags = ip->i_ffs1_flags = dip->di_flags; + ip->i_gen = ip->i_ffs1_gen = dip->di_gen; + ip->i_uid = ip->i_ffs1_uid = dip->di_uid; + ip->i_gid = ip->i_ffs1_gid = dip->di_gid; + + ip->i_mode = ip->i_ffs1_mode; + ip->i_nlink = ip->i_ffs1_nlink; + ip->i_size = ip->i_ffs1_size; + + LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE); + + /* Re-initialize to get type right */ + ufs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p, + &vp); + vput(vp); + + /* Record change in location */ + LFS_IENTRY(ifp, fs, dip->di_inumber, ibp); + daddr = ifp->if_daddr; + ifp->if_daddr = dbtofsb(fs, dbp->b_blkno); + error = LFS_BWRITE_LOG(ibp); /* Ifile */ + /* And do segment accounting */ + if (dtosn(fs, daddr) != dtosn(fs, dbtofsb(fs, dbp->b_blkno))) { + if (daddr > 0) { + LFS_SEGENTRY(sup, fs, dtosn(fs, daddr), + ibp); + sup->su_nbytes -= sizeof (struct ufs1_dinode); + LFS_WRITESEGENTRY(sup, fs, + dtosn(fs, daddr), + ibp); + } + LFS_SEGENTRY(sup, fs, dtosn(fs, dbtofsb(fs, dbp->b_blkno)), + ibp); + sup->su_nbytes += sizeof (struct ufs1_dinode); + LFS_WRITESEGENTRY(sup, fs, + dtosn(fs, dbtofsb(fs, dbp->b_blkno)), + ibp); + } + } + } + brelse(dbp, BC_AGE); + + return 0; +} + +#define CHECK_CKSUM 0x0001 /* Check the checksum to make sure it's valid */ +#define CHECK_UPDATE 0x0002 /* Update Ifile for new data blocks / inodes */ + +static daddr_t +check_segsum(struct lfs *fs, daddr_t offset, u_int64_t nextserial, + kauth_cred_t cred, int flags, int *pseg_flags, struct lwp *l) +{ + struct vnode *devvp; + struct buf *bp, *dbp; + int error, nblocks = 0, ninos, i, j; /* XXX: gcc */ + SEGSUM *ssp; + u_long *dp = NULL, *datap = NULL; /* XXX u_int32_t */ + daddr_t oldoffset; + int32_t *iaddr; /* XXX ondisk32 */ + FINFO *fip; + SEGUSE *sup; + size_t size; + + devvp = VTOI(fs->lfs_ivnode)->i_devvp; + /* + * If the 
segment has a superblock and we're at the top + * of the segment, skip the superblock. + */ + if (sntod(fs, dtosn(fs, offset)) == offset) { + LFS_SEGENTRY(sup, fs, dtosn(fs, offset), bp); + if (sup->su_flags & SEGUSE_SUPERBLOCK) + offset += btofsb(fs, LFS_SBPAD); + brelse(bp, 0); + } + + /* Read in the segment summary */ + error = bread(devvp, fsbtodb(fs, offset), fs->lfs_sumsize, + cred, 0, &bp); + if (error) + return -1; + + /* Check summary checksum */ + ssp = (SEGSUM *)bp->b_data; + if (flags & CHECK_CKSUM) { + if (ssp->ss_sumsum != cksum(&ssp->ss_datasum, + fs->lfs_sumsize - + sizeof(ssp->ss_sumsum))) { + DLOG((DLOG_RF, "Sumsum error at 0x%" PRIx64 "\n", offset)); + offset = -1; + goto err1; + } + if (ssp->ss_nfinfo == 0 && ssp->ss_ninos == 0) { + DLOG((DLOG_RF, "Empty pseg at 0x%" PRIx64 "\n", offset)); + offset = -1; + goto err1; + } + if (ssp->ss_create < fs->lfs_tstamp) { + DLOG((DLOG_RF, "Old data at 0x%" PRIx64 "\n", offset)); + offset = -1; + goto err1; + } + } + if (fs->lfs_version > 1) { + if (ssp->ss_serial != nextserial) { + DLOG((DLOG_RF, "Unexpected serial number at 0x%" PRIx64 + "\n", offset)); + offset = -1; + goto err1; + } + if (ssp->ss_ident != fs->lfs_ident) { + DLOG((DLOG_RF, "Incorrect fsid (0x%x vs 0x%x) at 0x%" + PRIx64 "\n", ssp->ss_ident, fs->lfs_ident, offset)); + offset = -1; + goto err1; + } + } + if (pseg_flags) + *pseg_flags = ssp->ss_flags; + oldoffset = offset; + offset += btofsb(fs, fs->lfs_sumsize); + + ninos = howmany(ssp->ss_ninos, INOPB(fs)); + /* XXX ondisk32 */ + iaddr = (int32_t *)((char*)bp->b_data + fs->lfs_sumsize - sizeof(int32_t)); + if (flags & CHECK_CKSUM) { + /* Count blocks */ + nblocks = 0; + fip = (FINFO *)((char*)bp->b_data + SEGSUM_SIZE(fs)); + for (i = 0; i < ssp->ss_nfinfo; ++i) { + nblocks += fip->fi_nblocks; + if (fip->fi_nblocks <= 0) + break; + /* XXX ondisk32 */ + fip = (FINFO *)(((char *)fip) + FINFOSIZE + + (fip->fi_nblocks * sizeof(int32_t))); + } + nblocks += ninos; + /* Create the sum array */ + datap = dp = (u_long *)malloc(nblocks * sizeof(u_long), + M_SEGMENT, M_WAITOK); + } + + /* Handle individual blocks */ + fip = (FINFO *)((char*)bp->b_data + SEGSUM_SIZE(fs)); + for (i = 0; i < ssp->ss_nfinfo || ninos; ++i) { + /* Inode block? 
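Each FINFO record above is a fixed-size head followed by fi_nblocks 32-bit block numbers, so both the counting pass and the per-block pass must recompute the record length at every step. A self-contained illustration of that variable-length-record walk; the record layout and counts are invented here, not the on-disk FINFO format:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Fixed head of each record; a variable number of int32_t block numbers
 * follows it directly in the buffer (the FINFOSIZE idea). */
struct rec_head {
	int32_t nblocks;
	int32_t ino;
};

int
main(void)
{
	unsigned char buf[256] = { 0 };
	static const int counts[] = { 2, 1, 4 };
	const int nrec = (int)(sizeof(counts) / sizeof(counts[0]));
	size_t off = 0;
	int i, total = 0;

	/* Lay down three record heads; the block numbers after each head
	 * are left as zeroes since the walk never looks at them. */
	for (i = 0; i < nrec; i++) {
		struct rec_head h = { counts[i], 100 + i };
		memcpy(buf + off, &h, sizeof(h));
		off += sizeof(h) + (size_t)counts[i] * sizeof(int32_t);
	}

	/* Walk the records the way check_segsum() totals nblocks. */
	off = 0;
	for (i = 0; i < nrec; i++) {
		struct rec_head h;
		memcpy(&h, buf + off, sizeof(h));
		total += h.nblocks;
		off += sizeof(h) + (size_t)h.nblocks * sizeof(int32_t);
	}
	printf("%d data blocks described in %zu bytes\n", total, off);
	return 0;
}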
*/ + if (ninos && *iaddr == offset) { + if (flags & CHECK_CKSUM) { + /* Read in the head and add to the buffer */ + error = bread(devvp, fsbtodb(fs, offset), fs->lfs_bsize, + cred, 0, &dbp); + if (error) { + offset = -1; + goto err2; + } + (*dp++) = ((u_long *)(dbp->b_data))[0]; + brelse(dbp, BC_AGE); + } + if (flags & CHECK_UPDATE) { + if ((error = update_inoblk(fs, offset, cred, l)) + != 0) { + offset = -1; + goto err2; + } + } + offset += btofsb(fs, fs->lfs_ibsize); + --iaddr; + --ninos; + --i; /* compensate */ + continue; + } + size = fs->lfs_bsize; + for (j = 0; j < fip->fi_nblocks; ++j) { + if (j == fip->fi_nblocks - 1) + size = fip->fi_lastlength; + if (flags & CHECK_CKSUM) { + error = bread(devvp, fsbtodb(fs, offset), size, + cred, 0, &dbp); + if (error) { + offset = -1; + goto err2; + } + (*dp++) = ((u_long *)(dbp->b_data))[0]; + brelse(dbp, BC_AGE); + } + /* Account for and update any direct blocks */ + if ((flags & CHECK_UPDATE) && + fip->fi_ino > LFS_IFILE_INUM && + fip->fi_blocks[j] >= 0) { + update_meta(fs, fip->fi_ino, fip->fi_version, + fip->fi_blocks[j], offset, size, l); + } + offset += btofsb(fs, size); + } + /* XXX ondisk32 */ + fip = (FINFO *)(((char *)fip) + FINFOSIZE + + fip->fi_nblocks * sizeof(int32_t)); + } + /* Checksum the array, compare */ + if ((flags & CHECK_CKSUM) && + ssp->ss_datasum != cksum(datap, nblocks * sizeof(u_long))) + { + DLOG((DLOG_RF, "Datasum error at 0x%" PRIx64 + " (wanted %x got %x)\n", + offset, ssp->ss_datasum, cksum(datap, nblocks * + sizeof(u_long)))); + offset = -1; + goto err2; + } + + /* If we're at the end of the segment, move to the next */ + if (dtosn(fs, offset + btofsb(fs, fs->lfs_sumsize + fs->lfs_bsize)) != + dtosn(fs, offset)) { + if (dtosn(fs, offset) == dtosn(fs, ssp->ss_next)) { + offset = -1; + goto err2; + } + offset = ssp->ss_next; + DLOG((DLOG_RF, "LFS roll forward: moving to offset 0x%" PRIx64 + " -> segment %d\n", offset, dtosn(fs,offset))); + } + + if (flags & CHECK_UPDATE) { + fs->lfs_avail -= (offset - oldoffset); + /* Don't clog the buffer queue */ + mutex_enter(&lfs_lock); + if (locked_queue_count > LFS_MAX_BUFS || + locked_queue_bytes > LFS_MAX_BYTES) { + lfs_flush(fs, SEGM_CKP, 0); + } + mutex_exit(&lfs_lock); + } + + err2: + if (flags & CHECK_CKSUM) + free(datap, M_SEGMENT); + err1: + brelse(bp, BC_AGE); + + /* XXX should we update the serial number even for bad psegs? */ + if ((flags & CHECK_UPDATE) && offset > 0 && fs->lfs_version > 1) + fs->lfs_serial = nextserial; + return offset; +} + +void +lfs_roll_forward(struct lfs *fs, struct mount *mp, struct lwp *l) +{ + int flags, dirty; + daddr_t offset, oldoffset, lastgoodpseg; + int sn, curseg, do_rollforward; + struct proc *p; + kauth_cred_t cred; + SEGUSE *sup; + struct buf *bp; + + p = l ? l->l_proc : NULL; + cred = p ? p->p_cred : NOCRED; + + /* + * Roll forward. + * + * We don't roll forward for v1 filesystems, because + * of the danger that the clock was turned back between the last + * checkpoint and crash. This would roll forward garbage. + * + * v2 filesystems don't have this problem because they use a + * monotonically increasing serial number instead of a timestamp. + */ + do_rollforward = (!(fs->lfs_pflags & LFS_PF_CLEAN) && + lfs_do_rfw && fs->lfs_version > 1 && p != NULL); + if (do_rollforward) { + u_int64_t nextserial; + /* + * Phase I: Find the address of the last good partial + * segment that was written after the checkpoint. Mark + * the segments in question dirty, so they won't be + * reallocated. 
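Stripped of the I/O and the Ifile bookkeeping, phase I reduces to walking the chain of partial segments, accepting each only while its serial number is the next expected value, and remembering the last address that checked out. A toy version of that loop over a hand-built chain; the addresses and serial numbers are invented:

#include <stdint.h>
#include <stdio.h>

/* Stand-in for an on-disk segment summary: its address and serial number. */
struct pseg {
	int64_t  addr;
	uint64_t serial;
};

int
main(void)
{
	static const struct pseg chain[] = {
		{ 1000, 101 },
		{ 1040, 102 },
		{ 1100, 87 },	/* stale summary left over from an old pass */
		{ 1160, 104 },
	};
	uint64_t nextserial = 101;	/* fs->lfs_serial + 1 at mount time */
	int64_t lastgood = 960;		/* the checkpointed fs->lfs_offset */
	size_t i;

	for (i = 0; i < sizeof(chain) / sizeof(chain[0]); i++) {
		if (chain[i].serial != nextserial)
			break;		/* not written after the checkpoint: stop */
		lastgood = chain[i].addr;
		nextserial++;
	}
	printf("last good partial segment at %lld\n", (long long)lastgood);
	return 0;
}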
+ */ + lastgoodpseg = oldoffset = offset = fs->lfs_offset; + flags = 0x0; + DLOG((DLOG_RF, "LFS roll forward phase 1: start at offset 0x%" + PRIx64 "\n", offset)); + LFS_SEGENTRY(sup, fs, dtosn(fs, offset), bp); + if (!(sup->su_flags & SEGUSE_DIRTY)) + --fs->lfs_nclean; + sup->su_flags |= SEGUSE_DIRTY; + LFS_WRITESEGENTRY(sup, fs, dtosn(fs, offset), bp); + nextserial = fs->lfs_serial + 1; + while ((offset = check_segsum(fs, offset, nextserial, + cred, CHECK_CKSUM, &flags, l)) > 0) { + nextserial++; + if (sntod(fs, oldoffset) != sntod(fs, offset)) { + LFS_SEGENTRY(sup, fs, dtosn(fs, oldoffset), + bp); + if (!(sup->su_flags & SEGUSE_DIRTY)) + --fs->lfs_nclean; + sup->su_flags |= SEGUSE_DIRTY; + LFS_WRITESEGENTRY(sup, fs, dtosn(fs, oldoffset), + bp); + } + + DLOG((DLOG_RF, "LFS roll forward phase 1: offset=0x%" + PRIx64 "\n", offset)); + if (flags & SS_DIROP) { + DLOG((DLOG_RF, "lfs_mountfs: dirops at 0x%" + PRIx64 "\n", oldoffset)); + if (!(flags & SS_CONT)) { + DLOG((DLOG_RF, "lfs_mountfs: dirops end " + "at 0x%" PRIx64 "\n", oldoffset)); + } + } + if (!(flags & SS_CONT)) + lastgoodpseg = offset; + oldoffset = offset; + } + if (flags & SS_CONT) { + DLOG((DLOG_RF, "LFS roll forward: warning: incomplete " + "dirops discarded\n")); + } + DLOG((DLOG_RF, "LFS roll forward phase 1: completed: " + "lastgoodpseg=0x%" PRIx64 "\n", lastgoodpseg)); + oldoffset = fs->lfs_offset; + if (fs->lfs_offset != lastgoodpseg) { + /* Don't overwrite what we're trying to preserve */ + offset = fs->lfs_offset; + fs->lfs_offset = lastgoodpseg; + fs->lfs_curseg = sntod(fs, dtosn(fs, fs->lfs_offset)); + for (sn = curseg = dtosn(fs, fs->lfs_curseg);;) { + sn = (sn + 1) % fs->lfs_nseg; + if (sn == curseg) + panic("lfs_mountfs: no clean segments"); + LFS_SEGENTRY(sup, fs, sn, bp); + dirty = (sup->su_flags & SEGUSE_DIRTY); + brelse(bp, 0); + if (!dirty) + break; + } + fs->lfs_nextseg = sntod(fs, sn); + + /* + * Phase II: Roll forward from the first superblock. + */ + while (offset != lastgoodpseg) { + DLOG((DLOG_RF, "LFS roll forward phase 2: 0x%" + PRIx64 "\n", offset)); + offset = check_segsum(fs, offset, + fs->lfs_serial + 1, cred, CHECK_UPDATE, + NULL, l); + } + + /* + * Finish: flush our changes to disk. + */ + lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC); + DLOG((DLOG_RF, "lfs_mountfs: roll forward ", + "recovered %lld blocks\n", + (long long)(lastgoodpseg - oldoffset))); + } + DLOG((DLOG_RF, "LFS roll forward complete\n")); + } +} diff --git a/sys/ufs/lfs/lfs_segment.c b/sys/ufs/lfs/lfs_segment.c new file mode 100644 index 000000000..aea143a5c --- /dev/null +++ b/sys/ufs/lfs/lfs_segment.c @@ -0,0 +1,2829 @@ +/* $NetBSD: lfs_segment.c,v 1.222 2011/07/11 08:27:40 hannken Exp $ */ + +/*- + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Konrad E. Schroder . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)lfs_segment.c 8.10 (Berkeley) 6/10/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: lfs_segment.c,v 1.222 2011/07/11 08:27:40 hannken Exp $"); + +#ifdef DEBUG +# define vndebug(vp, str) do { \ + if (VTOI(vp)->i_flag & IN_CLEANING) \ + DLOG((DLOG_WVNODE, "not writing ino %d because %s (op %d)\n", \ + VTOI(vp)->i_number, (str), op)); \ +} while(0) +#else +# define vndebug(vp, str) +#endif +#define ivndebug(vp, str) \ + DLOG((DLOG_WVNODE, "ino %d: %s\n", VTOI(vp)->i_number, (str))) + +#if defined(_KERNEL_OPT) +#include "opt_ddb.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +MALLOC_JUSTDEFINE(M_SEGMENT, "LFS segment", "Segment for LFS"); + +static void lfs_generic_callback(struct buf *, void (*)(struct buf *)); +static void lfs_free_aiodone(struct buf *); +static void lfs_super_aiodone(struct buf *); +static void lfs_cluster_aiodone(struct buf *); +static void lfs_cluster_callback(struct buf *); + +/* + * Determine if it's OK to start a partial in this segment, or if we need + * to go on to a new segment. + */ +#define LFS_PARTIAL_FITS(fs) \ + ((fs)->lfs_fsbpseg - ((fs)->lfs_offset - (fs)->lfs_curseg) > \ + (fs)->lfs_frag) + +/* + * Figure out whether we should do a checkpoint write or go ahead with + * an ordinary write. + */ +#define LFS_SHOULD_CHECKPOINT(fs, flags) \ + ((flags & SEGM_CLEAN) == 0 && \ + ((fs->lfs_nactive > LFS_MAX_ACTIVE || \ + (flags & SEGM_CKP) || \ + fs->lfs_nclean < LFS_MAX_ACTIVE))) + +int lfs_match_fake(struct lfs *, struct buf *); +void lfs_newseg(struct lfs *); +/* XXX ondisk32 */ +void lfs_shellsort(struct buf **, int32_t *, int, int); +void lfs_supercallback(struct buf *); +void lfs_updatemeta(struct segment *); +void lfs_writesuper(struct lfs *, daddr_t); +int lfs_writevnodes(struct lfs *fs, struct mount *mp, + struct segment *sp, int dirops); + +int lfs_allclean_wakeup; /* Cleaner wakeup address. */ +int lfs_writeindir = 1; /* whether to flush indir on non-ckp */ +int lfs_clean_vnhead = 0; /* Allow freeing to head of vn list */ +int lfs_dirvcount = 0; /* # active dirops */ + +/* Statistics Counters */ +int lfs_dostats = 1; +struct lfs_stats lfs_stats; + +/* op values to lfs_writevnodes */ +#define VN_REG 0 +#define VN_DIROP 1 +#define VN_EMPTY 2 +#define VN_CLEAN 3 + +/* + * XXX KS - Set modification time on the Ifile, so the cleaner can + * read the fs mod time off of it. We don't set IN_UPDATE here, + * since we don't really need this to be flushed to disk (and in any + * case that wouldn't happen to the Ifile until we checkpoint). + */ +void +lfs_imtime(struct lfs *fs) +{ + struct timespec ts; + struct inode *ip; + + ASSERT_MAYBE_SEGLOCK(fs); + vfs_timestamp(&ts); + ip = VTOI(fs->lfs_ivnode); + ip->i_ffs1_mtime = ts.tv_sec; + ip->i_ffs1_mtimensec = ts.tv_nsec; +} + +/* + * Ifile and meta data blocks are not marked busy, so segment writes MUST be + * single threaded. Currently, there are two paths into lfs_segwrite, sync() + * and getnewbuf(). They both mark the file system busy. Lfs_vflush() + * explicitly marks the file system busy. So lfs_segwrite is safe. I think. 
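LFS_SHOULD_CHECKPOINT() above folds the decision into one expression: cleaner writes never checkpoint, and any other write checkpoints when explicitly asked, when too many segments are active, or when clean segments are running low. The same test as a plain function, with invented flag values and an assumed LFS_MAX_ACTIVE threshold:

#include <stdbool.h>
#include <stdio.h>

#define SEGM_CKP	0x01	/* made-up flag values, for the sketch only */
#define SEGM_CLEAN	0x02
#define LFS_MAX_ACTIVE	10	/* assumed threshold */

/* Same shape as LFS_SHOULD_CHECKPOINT(fs, flags), with the fs fields
 * passed in as plain integers. */
static bool
should_checkpoint(int nactive, int nclean, int flags)
{
	return (flags & SEGM_CLEAN) == 0 &&
	    (nactive > LFS_MAX_ACTIVE ||
	    (flags & SEGM_CKP) != 0 ||
	    nclean < LFS_MAX_ACTIVE);
}

int
main(void)
{
	printf("%d\n", should_checkpoint(3, 50, 0));		/* routine write: 0 */
	printf("%d\n", should_checkpoint(3, 4, 0));		/* clean segments scarce: 1 */
	printf("%d\n", should_checkpoint(3, 50, SEGM_CKP));	/* explicit checkpoint: 1 */
	printf("%d\n", should_checkpoint(3, 4, SEGM_CLEAN));	/* cleaner write: 0 */
	return 0;
}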
+ */ + +#define IS_FLUSHING(fs,vp) ((fs)->lfs_flushvp == (vp)) + +int +lfs_vflush(struct vnode *vp) +{ + struct inode *ip; + struct lfs *fs; + struct segment *sp; + struct buf *bp, *nbp, *tbp, *tnbp; + int error; + int flushed; + int relock; + int loopcount; + + ip = VTOI(vp); + fs = VFSTOUFS(vp->v_mount)->um_lfs; + relock = 0; + + top: + ASSERT_NO_SEGLOCK(fs); + if (ip->i_flag & IN_CLEANING) { + ivndebug(vp,"vflush/in_cleaning"); + mutex_enter(&lfs_lock); + LFS_CLR_UINO(ip, IN_CLEANING); + LFS_SET_UINO(ip, IN_MODIFIED); + mutex_exit(&lfs_lock); + + /* + * Toss any cleaning buffers that have real counterparts + * to avoid losing new data. + */ + mutex_enter(vp->v_interlock); + for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = LIST_NEXT(bp, b_vnbufs); + if (!LFS_IS_MALLOC_BUF(bp)) + continue; + /* + * Look for pages matching the range covered + * by cleaning blocks. It's okay if more dirty + * pages appear, so long as none disappear out + * from under us. + */ + if (bp->b_lblkno > 0 && vp->v_type == VREG && + vp != fs->lfs_ivnode) { + struct vm_page *pg; + voff_t off; + + for (off = lblktosize(fs, bp->b_lblkno); + off < lblktosize(fs, bp->b_lblkno + 1); + off += PAGE_SIZE) { + pg = uvm_pagelookup(&vp->v_uobj, off); + if (pg == NULL) + continue; + if ((pg->flags & PG_CLEAN) == 0 || + pmap_is_modified(pg)) { + fs->lfs_avail += btofsb(fs, + bp->b_bcount); + wakeup(&fs->lfs_avail); + mutex_exit(vp->v_interlock); + lfs_freebuf(fs, bp); + mutex_enter(vp->v_interlock); + bp = NULL; + break; + } + } + } + for (tbp = LIST_FIRST(&vp->v_dirtyblkhd); tbp; + tbp = tnbp) + { + tnbp = LIST_NEXT(tbp, b_vnbufs); + if (tbp->b_vp == bp->b_vp + && tbp->b_lblkno == bp->b_lblkno + && tbp != bp) + { + fs->lfs_avail += btofsb(fs, + bp->b_bcount); + wakeup(&fs->lfs_avail); + mutex_exit(vp->v_interlock); + lfs_freebuf(fs, bp); + mutex_enter(vp->v_interlock); + bp = NULL; + break; + } + } + } + } else { + mutex_enter(vp->v_interlock); + } + + /* If the node is being written, wait until that is done */ + while (WRITEINPROG(vp)) { + ivndebug(vp,"vflush/writeinprog"); + cv_wait(&vp->v_cv, vp->v_interlock); + } + mutex_exit(vp->v_interlock); + + /* Protect against VI_XLOCK deadlock in vinvalbuf() */ + lfs_seglock(fs, SEGM_SYNC); + + /* If we're supposed to flush a freed inode, just toss it */ + if (ip->i_lfs_iflags & LFSI_DELETED) { + DLOG((DLOG_VNODE, "lfs_vflush: ino %d freed, not flushing\n", + ip->i_number)); + /* Drain v_numoutput */ + mutex_enter(vp->v_interlock); + while (vp->v_numoutput > 0) { + cv_wait(&vp->v_cv, vp->v_interlock); + } + KASSERT(vp->v_numoutput == 0); + mutex_exit(vp->v_interlock); + + mutex_enter(&bufcache_lock); + for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = LIST_NEXT(bp, b_vnbufs); + + KASSERT((bp->b_flags & B_GATHERED) == 0); + if (bp->b_oflags & BO_DELWRI) { /* XXX always true? 
*/ + fs->lfs_avail += btofsb(fs, bp->b_bcount); + wakeup(&fs->lfs_avail); + } + /* Copied from lfs_writeseg */ + if (bp->b_iodone != NULL) { + mutex_exit(&bufcache_lock); + biodone(bp); + mutex_enter(&bufcache_lock); + } else { + bremfree(bp); + LFS_UNLOCK_BUF(bp); + mutex_enter(vp->v_interlock); + bp->b_flags &= ~(B_READ | B_GATHERED); + bp->b_oflags = (bp->b_oflags & ~BO_DELWRI) | BO_DONE; + bp->b_error = 0; + reassignbuf(bp, vp); + mutex_exit(vp->v_interlock); + brelse(bp, 0); + } + } + mutex_exit(&bufcache_lock); + LFS_CLR_UINO(ip, IN_CLEANING); + LFS_CLR_UINO(ip, IN_MODIFIED | IN_ACCESSED); + ip->i_flag &= ~IN_ALLMOD; + DLOG((DLOG_VNODE, "lfs_vflush: done not flushing ino %d\n", + ip->i_number)); + lfs_segunlock(fs); + + KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL); + + return 0; + } + + fs->lfs_flushvp = vp; + if (LFS_SHOULD_CHECKPOINT(fs, fs->lfs_sp->seg_flags)) { + error = lfs_segwrite(vp->v_mount, SEGM_CKP | SEGM_SYNC); + fs->lfs_flushvp = NULL; + KASSERT(fs->lfs_flushvp_fakevref == 0); + lfs_segunlock(fs); + + /* Make sure that any pending buffers get written */ + mutex_enter(vp->v_interlock); + while (vp->v_numoutput > 0) { + cv_wait(&vp->v_cv, vp->v_interlock); + } + KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL); + KASSERT(vp->v_numoutput == 0); + mutex_exit(vp->v_interlock); + + return error; + } + sp = fs->lfs_sp; + + flushed = 0; + if (VPISEMPTY(vp)) { + lfs_writevnodes(fs, vp->v_mount, sp, VN_EMPTY); + ++flushed; + } else if ((ip->i_flag & IN_CLEANING) && + (fs->lfs_sp->seg_flags & SEGM_CLEAN)) { + ivndebug(vp,"vflush/clean"); + lfs_writevnodes(fs, vp->v_mount, sp, VN_CLEAN); + ++flushed; + } else if (lfs_dostats) { + if (!VPISEMPTY(vp) || (VTOI(vp)->i_flag & IN_ALLMOD)) + ++lfs_stats.vflush_invoked; + ivndebug(vp,"vflush"); + } + +#ifdef DIAGNOSTIC + if (vp->v_uflag & VU_DIROP) { + DLOG((DLOG_VNODE, "lfs_vflush: flushing VU_DIROP\n")); + /* panic("lfs_vflush: VU_DIROP being flushed...this can\'t happen"); */ + } +#endif + + do { + loopcount = 0; + do { + if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) { + relock = lfs_writefile(fs, sp, vp); + if (relock) { + /* + * Might have to wait for the + * cleaner to run; but we're + * still not done with this vnode. + */ + KDASSERT(ip->i_number != LFS_IFILE_INUM); + lfs_writeinode(fs, sp, ip); + mutex_enter(&lfs_lock); + LFS_SET_UINO(ip, IN_MODIFIED); + mutex_exit(&lfs_lock); + lfs_writeseg(fs, sp); + lfs_segunlock(fs); + lfs_segunlock_relock(fs); + goto top; + } + } + /* + * If we begin a new segment in the middle of writing + * the Ifile, it creates an inconsistent checkpoint, + * since the Ifile information for the new segment + * is not up-to-date. Take care of this here by + * sending the Ifile through again in case there + * are newly dirtied blocks. But wait, there's more! + * This second Ifile write could *also* cross a segment + * boundary, if the first one was large. The second + * one is guaranteed to be no more than 8 blocks, + * though (two segment blocks and supporting indirects) + * so the third write *will not* cross the boundary. 
+ */ + if (vp == fs->lfs_ivnode) { + lfs_writefile(fs, sp, vp); + lfs_writefile(fs, sp, vp); + } +#ifdef DEBUG + if (++loopcount > 2) + log(LOG_NOTICE, "lfs_vflush: looping count=%d\n", loopcount); +#endif + } while (lfs_writeinode(fs, sp, ip)); + } while (lfs_writeseg(fs, sp) && ip->i_number == LFS_IFILE_INUM); + + if (lfs_dostats) { + ++lfs_stats.nwrites; + if (sp->seg_flags & SEGM_SYNC) + ++lfs_stats.nsync_writes; + if (sp->seg_flags & SEGM_CKP) + ++lfs_stats.ncheckpoints; + } + /* + * If we were called from somewhere that has already held the seglock + * (e.g., lfs_markv()), the lfs_segunlock will not wait for + * the write to complete because we are still locked. + * Since lfs_vflush() must return the vnode with no dirty buffers, + * we must explicitly wait, if that is the case. + * + * We compare the iocount against 1, not 0, because it is + * artificially incremented by lfs_seglock(). + */ + mutex_enter(&lfs_lock); + if (fs->lfs_seglock > 1) { + while (fs->lfs_iocount > 1) + (void)mtsleep(&fs->lfs_iocount, PRIBIO + 1, + "lfs_vflush", 0, &lfs_lock); + } + mutex_exit(&lfs_lock); + + lfs_segunlock(fs); + + /* Wait for these buffers to be recovered by aiodoned */ + mutex_enter(vp->v_interlock); + while (vp->v_numoutput > 0) { + cv_wait(&vp->v_cv, vp->v_interlock); + } + KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL); + KASSERT(vp->v_numoutput == 0); + mutex_exit(vp->v_interlock); + + fs->lfs_flushvp = NULL; + KASSERT(fs->lfs_flushvp_fakevref == 0); + + return (0); +} + +int +lfs_writevnodes(struct lfs *fs, struct mount *mp, struct segment *sp, int op) +{ + struct inode *ip; + struct vnode *vp; + int inodes_written = 0, only_cleaning; + int error = 0; + + ASSERT_SEGLOCK(fs); + loop: + /* start at last (newest) vnode. */ + mutex_enter(&mntvnode_lock); + TAILQ_FOREACH_REVERSE(vp, &mp->mnt_vnodelist, vnodelst, v_mntvnodes) { + /* + * If the vnode that we are about to sync is no longer + * associated with this mount point, start over. + */ + if (vp->v_mount != mp) { + DLOG((DLOG_VNODE, "lfs_writevnodes: starting over\n")); + /* + * After this, pages might be busy + * due to our own previous putpages. + * Start actual segment write here to avoid deadlock. + */ + mutex_exit(&mntvnode_lock); + (void)lfs_writeseg(fs, sp); + goto loop; + } + + mutex_enter(vp->v_interlock); + if (vp->v_type == VNON || vismarker(vp) || + (vp->v_iflag & VI_CLEAN) != 0) { + mutex_exit(vp->v_interlock); + continue; + } + + ip = VTOI(vp); + if ((op == VN_DIROP && !(vp->v_uflag & VU_DIROP)) || + (op != VN_DIROP && op != VN_CLEAN && + (vp->v_uflag & VU_DIROP))) { + mutex_exit(vp->v_interlock); + vndebug(vp,"dirop"); + continue; + } + + if (op == VN_EMPTY && !VPISEMPTY(vp)) { + mutex_exit(vp->v_interlock); + vndebug(vp,"empty"); + continue; + } + + if (op == VN_CLEAN && ip->i_number != LFS_IFILE_INUM + && vp != fs->lfs_flushvp + && !(ip->i_flag & IN_CLEANING)) { + mutex_exit(vp->v_interlock); + vndebug(vp,"cleaning"); + continue; + } + + mutex_exit(&mntvnode_lock); + if (lfs_vref(vp)) { + vndebug(vp,"vref"); + mutex_enter(&mntvnode_lock); + continue; + } + + only_cleaning = 0; + /* + * Write the inode/file if dirty and it's not the IFILE. 
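+ * The Ifile itself is skipped here; its blocks and inode are
+ * written by lfs_segwrite when the checkpoint is taken.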
+ */ + if ((ip->i_flag & IN_ALLMOD) || !VPISEMPTY(vp)) { + only_cleaning = + ((ip->i_flag & IN_ALLMOD) == IN_CLEANING); + + if (ip->i_number != LFS_IFILE_INUM) { + error = lfs_writefile(fs, sp, vp); + if (error) { + lfs_vunref(vp); + if (error == EAGAIN) { + /* + * This error from lfs_putpages + * indicates we need to drop + * the segment lock and start + * over after the cleaner has + * had a chance to run. + */ + lfs_writeinode(fs, sp, ip); + lfs_writeseg(fs, sp); + if (!VPISEMPTY(vp) && + !WRITEINPROG(vp) && + !(ip->i_flag & IN_ALLMOD)) { + mutex_enter(&lfs_lock); + LFS_SET_UINO(ip, IN_MODIFIED); + mutex_exit(&lfs_lock); + } + mutex_enter(&mntvnode_lock); + break; + } + error = 0; /* XXX not quite right */ + mutex_enter(&mntvnode_lock); + continue; + } + + if (!VPISEMPTY(vp)) { + if (WRITEINPROG(vp)) { + ivndebug(vp,"writevnodes/write2"); + } else if (!(ip->i_flag & IN_ALLMOD)) { + mutex_enter(&lfs_lock); + LFS_SET_UINO(ip, IN_MODIFIED); + mutex_exit(&lfs_lock); + } + } + (void) lfs_writeinode(fs, sp, ip); + inodes_written++; + } + } + + if (lfs_clean_vnhead && only_cleaning) + lfs_vunref_head(vp); + else + lfs_vunref(vp); + + mutex_enter(&mntvnode_lock); + } + mutex_exit(&mntvnode_lock); + return error; +} + +/* + * Do a checkpoint. + */ +int +lfs_segwrite(struct mount *mp, int flags) +{ + struct buf *bp; + struct inode *ip; + struct lfs *fs; + struct segment *sp; + struct vnode *vp; + SEGUSE *segusep; + int do_ckp, did_ckp, error; + unsigned n, segleft, maxseg, sn, i, curseg; + int writer_set = 0; + int dirty; + int redo; + int um_error; + int loopcount; + + fs = VFSTOUFS(mp)->um_lfs; + ASSERT_MAYBE_SEGLOCK(fs); + + if (fs->lfs_ronly) + return EROFS; + + lfs_imtime(fs); + + /* + * Allocate a segment structure and enough space to hold pointers to + * the maximum possible number of buffers which can be described in a + * single summary block. + */ + do_ckp = LFS_SHOULD_CHECKPOINT(fs, flags); + + lfs_seglock(fs, flags | (do_ckp ? SEGM_CKP : 0)); + sp = fs->lfs_sp; + if (sp->seg_flags & (SEGM_CLEAN | SEGM_CKP)) + do_ckp = 1; + + /* + * If lfs_flushvp is non-NULL, we are called from lfs_vflush, + * in which case we have to flush *all* buffers off of this vnode. + * We don't care about other nodes, but write any non-dirop nodes + * anyway in anticipation of another getnewvnode(). + * + * If we're cleaning we only write cleaning and ifile blocks, and + * no dirops, since otherwise we'd risk corruption in a crash. + */ + if (sp->seg_flags & SEGM_CLEAN) + lfs_writevnodes(fs, mp, sp, VN_CLEAN); + else if (!(sp->seg_flags & SEGM_FORCE_CKP)) { + do { + um_error = lfs_writevnodes(fs, mp, sp, VN_REG); + + if (do_ckp || fs->lfs_dirops == 0) { + if (!writer_set) { + lfs_writer_enter(fs, "lfs writer"); + writer_set = 1; + } + error = lfs_writevnodes(fs, mp, sp, VN_DIROP); + if (um_error == 0) + um_error = error; + /* In case writevnodes errored out */ + lfs_flush_dirops(fs); + ((SEGSUM *)(sp->segsum))->ss_flags &= ~(SS_CONT); + lfs_finalize_fs_seguse(fs); + } + if (do_ckp && um_error) { + lfs_segunlock_relock(fs); + sp = fs->lfs_sp; + } + } while (do_ckp && um_error != 0); + } + + /* + * If we are doing a checkpoint, mark everything since the + * last checkpoint as no longer ACTIVE. 
+ */ + if (do_ckp || fs->lfs_doifile) { + segleft = fs->lfs_nseg; + curseg = 0; + for (n = 0; n < fs->lfs_segtabsz; n++) { + dirty = 0; + if (bread(fs->lfs_ivnode, fs->lfs_cleansz + n, + fs->lfs_bsize, NOCRED, B_MODIFY, &bp)) + panic("lfs_segwrite: ifile read"); + segusep = (SEGUSE *)bp->b_data; + maxseg = min(segleft, fs->lfs_sepb); + for (i = 0; i < maxseg; i++) { + sn = curseg + i; + if (sn != dtosn(fs, fs->lfs_curseg) && + segusep->su_flags & SEGUSE_ACTIVE) { + segusep->su_flags &= ~SEGUSE_ACTIVE; + --fs->lfs_nactive; + ++dirty; + } + fs->lfs_suflags[fs->lfs_activesb][sn] = + segusep->su_flags; + if (fs->lfs_version > 1) + ++segusep; + else + segusep = (SEGUSE *) + ((SEGUSE_V1 *)segusep + 1); + } + + if (dirty) + error = LFS_BWRITE_LOG(bp); /* Ifile */ + else + brelse(bp, 0); + segleft -= fs->lfs_sepb; + curseg += fs->lfs_sepb; + } + } + + KASSERT(LFS_SEGLOCK_HELD(fs)); + + did_ckp = 0; + if (do_ckp || fs->lfs_doifile) { + vp = fs->lfs_ivnode; + vn_lock(vp, LK_EXCLUSIVE); + loopcount = 0; + do { +#ifdef DEBUG + LFS_ENTER_LOG("pretend", __FILE__, __LINE__, 0, 0, curproc->p_pid); +#endif + mutex_enter(&lfs_lock); + fs->lfs_flags &= ~LFS_IFDIRTY; + mutex_exit(&lfs_lock); + + ip = VTOI(vp); + + if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) { + /* + * Ifile has no pages, so we don't need + * to check error return here. + */ + lfs_writefile(fs, sp, vp); + /* + * Ensure the Ifile takes the current segment + * into account. See comment in lfs_vflush. + */ + lfs_writefile(fs, sp, vp); + lfs_writefile(fs, sp, vp); + } + + if (ip->i_flag & IN_ALLMOD) + ++did_ckp; +#if 0 + redo = (do_ckp ? lfs_writeinode(fs, sp, ip) : 0); +#else + redo = lfs_writeinode(fs, sp, ip); +#endif + redo += lfs_writeseg(fs, sp); + mutex_enter(&lfs_lock); + redo += (fs->lfs_flags & LFS_IFDIRTY); + mutex_exit(&lfs_lock); +#ifdef DEBUG + if (++loopcount > 2) + log(LOG_NOTICE, "lfs_segwrite: looping count=%d\n", + loopcount); +#endif + } while (redo && do_ckp); + + /* + * Unless we are unmounting, the Ifile may continue to have + * dirty blocks even after a checkpoint, due to changes to + * inodes' atime. If we're checkpointing, it's "impossible" + * for other parts of the Ifile to be dirty after the loop + * above, since we hold the segment lock. + */ + mutex_enter(vp->v_interlock); + if (LIST_EMPTY(&vp->v_dirtyblkhd)) { + LFS_CLR_UINO(ip, IN_ALLMOD); + } +#ifdef DIAGNOSTIC + else if (do_ckp) { + int do_panic = 0; + LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) { + if (bp->b_lblkno < fs->lfs_cleansz + + fs->lfs_segtabsz && + !(bp->b_flags & B_GATHERED)) { + printf("ifile lbn %ld still dirty (flags %lx)\n", + (long)bp->b_lblkno, + (long)bp->b_flags); + ++do_panic; + } + } + if (do_panic) + panic("dirty blocks"); + } +#endif + mutex_exit(vp->v_interlock); + VOP_UNLOCK(vp); + } else { + (void) lfs_writeseg(fs, sp); + } + + /* Note Ifile no longer needs to be written */ + fs->lfs_doifile = 0; + if (writer_set) + lfs_writer_leave(fs); + + /* + * If we didn't write the Ifile, we didn't really do anything. + * That means that (1) there is a checkpoint on disk and (2) + * nothing has changed since it was written. + * + * Take the flags off of the segment so that lfs_segunlock + * doesn't have to write the superblock either. 
+ */ + if (do_ckp && !did_ckp) { + sp->seg_flags &= ~SEGM_CKP; + } + + if (lfs_dostats) { + ++lfs_stats.nwrites; + if (sp->seg_flags & SEGM_SYNC) + ++lfs_stats.nsync_writes; + if (sp->seg_flags & SEGM_CKP) + ++lfs_stats.ncheckpoints; + } + lfs_segunlock(fs); + return (0); +} + +/* + * Write the dirty blocks associated with a vnode. + */ +int +lfs_writefile(struct lfs *fs, struct segment *sp, struct vnode *vp) +{ + struct finfo *fip; + struct inode *ip; + int i, frag; + int error; + + ASSERT_SEGLOCK(fs); + error = 0; + ip = VTOI(vp); + + fip = sp->fip; + lfs_acquire_finfo(fs, ip->i_number, ip->i_gen); + + if (vp->v_uflag & VU_DIROP) + ((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT); + + if (sp->seg_flags & SEGM_CLEAN) { + lfs_gather(fs, sp, vp, lfs_match_fake); + /* + * For a file being flushed, we need to write *all* blocks. + * This means writing the cleaning blocks first, and then + * immediately following with any non-cleaning blocks. + * The same is true of the Ifile since checkpoints assume + * that all valid Ifile blocks are written. + */ + if (IS_FLUSHING(fs, vp) || vp == fs->lfs_ivnode) { + lfs_gather(fs, sp, vp, lfs_match_data); + /* + * Don't call VOP_PUTPAGES: if we're flushing, + * we've already done it, and the Ifile doesn't + * use the page cache. + */ + } + } else { + lfs_gather(fs, sp, vp, lfs_match_data); + /* + * If we're flushing, we've already called VOP_PUTPAGES + * so don't do it again. Otherwise, we want to write + * everything we've got. + */ + if (!IS_FLUSHING(fs, vp)) { + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, 0, 0, + PGO_CLEANIT | PGO_ALLPAGES | PGO_LOCKED); + } + } + + /* + * It may not be necessary to write the meta-data blocks at this point, + * as the roll-forward recovery code should be able to reconstruct the + * list. + * + * We have to write them anyway, though, under two conditions: (1) the + * vnode is being flushed (for reuse by vinvalbuf); or (2) we are + * checkpointing. + * + * BUT if we are cleaning, we might have indirect blocks that refer to + * new blocks not being written yet, in addition to fragments being + * moved out of a cleaned segment. If that is the case, don't + * write the indirect blocks, or the finfo will have a small block + * in the middle of it! + * XXX in this case isn't the inode size wrong too? + */ + frag = 0; + if (sp->seg_flags & SEGM_CLEAN) { + for (i = 0; i < NDADDR; i++) + if (ip->i_lfs_fragsize[i] > 0 && + ip->i_lfs_fragsize[i] < fs->lfs_bsize) + ++frag; + } +#ifdef DIAGNOSTIC + if (frag > 1) + panic("lfs_writefile: more than one fragment!"); +#endif + if (IS_FLUSHING(fs, vp) || + (frag == 0 && (lfs_writeindir || (sp->seg_flags & SEGM_CKP)))) { + lfs_gather(fs, sp, vp, lfs_match_indir); + lfs_gather(fs, sp, vp, lfs_match_dindir); + lfs_gather(fs, sp, vp, lfs_match_tindir); + } + fip = sp->fip; + lfs_release_finfo(fs); + + return error; +} + +/* + * Update segment accounting to reflect this inode's change of address. + */ +static int +lfs_update_iaddr(struct lfs *fs, struct segment *sp, struct inode *ip, daddr_t ndaddr) +{ + struct buf *bp; + daddr_t daddr; + IFILE *ifp; + SEGUSE *sup; + ino_t ino; + int redo_ifile, error; + u_int32_t sn; + + redo_ifile = 0; + + /* + * If updating the ifile, update the super-block. Update the disk + * address and access times for this inode in the ifile. 
+ */ + ino = ip->i_number; + if (ino == LFS_IFILE_INUM) { + daddr = fs->lfs_idaddr; + fs->lfs_idaddr = dbtofsb(fs, ndaddr); + } else { + LFS_IENTRY(ifp, fs, ino, bp); + daddr = ifp->if_daddr; + ifp->if_daddr = dbtofsb(fs, ndaddr); + error = LFS_BWRITE_LOG(bp); /* Ifile */ + } + + /* + * If this is the Ifile and lfs_offset is set to the first block + * in the segment, dirty the new segment's accounting block + * (XXX should already be dirty?) and tell the caller to do it again. + */ + if (ip->i_number == LFS_IFILE_INUM) { + sn = dtosn(fs, fs->lfs_offset); + if (sntod(fs, sn) + btofsb(fs, fs->lfs_sumsize) == + fs->lfs_offset) { + LFS_SEGENTRY(sup, fs, sn, bp); + KASSERT(bp->b_oflags & BO_DELWRI); + LFS_WRITESEGENTRY(sup, fs, sn, bp); + /* fs->lfs_flags |= LFS_IFDIRTY; */ + redo_ifile |= 1; + } + } + + /* + * The inode's last address should not be in the current partial + * segment, except under exceptional circumstances (lfs_writevnodes + * had to start over, and in the meantime more blocks were written + * to a vnode). Both inodes will be accounted to this segment + * in lfs_writeseg so we need to subtract the earlier version + * here anyway. The segment count can temporarily dip below + * zero here; keep track of how many duplicates we have in + * "dupino" so we don't panic below. + */ + if (daddr >= fs->lfs_lastpseg && daddr <= fs->lfs_offset) { + ++sp->ndupino; + DLOG((DLOG_SEG, "lfs_writeinode: last inode addr in current pseg " + "(ino %d daddr 0x%llx) ndupino=%d\n", ino, + (long long)daddr, sp->ndupino)); + } + /* + * Account the inode: it no longer belongs to its former segment, + * though it will not belong to the new segment until that segment + * is actually written. + */ + if (daddr != LFS_UNUSED_DADDR) { + u_int32_t oldsn = dtosn(fs, daddr); +#ifdef DIAGNOSTIC + int ndupino = (sp->seg_number == oldsn) ? sp->ndupino : 0; +#endif + LFS_SEGENTRY(sup, fs, oldsn, bp); +#ifdef DIAGNOSTIC + if (sup->su_nbytes + + sizeof (struct ufs1_dinode) * ndupino + < sizeof (struct ufs1_dinode)) { + printf("lfs_writeinode: negative bytes " + "(segment %" PRIu32 " short by %d, " + "oldsn=%" PRIu32 ", cursn=%" PRIu32 + ", daddr=%" PRId64 ", su_nbytes=%u, " + "ndupino=%d)\n", + dtosn(fs, daddr), + (int)sizeof (struct ufs1_dinode) * + (1 - sp->ndupino) - sup->su_nbytes, + oldsn, sp->seg_number, daddr, + (unsigned int)sup->su_nbytes, + sp->ndupino); + panic("lfs_writeinode: negative bytes"); + sup->su_nbytes = sizeof (struct ufs1_dinode); + } +#endif + DLOG((DLOG_SU, "seg %d -= %d for ino %d inode\n", + dtosn(fs, daddr), sizeof (struct ufs1_dinode), ino)); + sup->su_nbytes -= sizeof (struct ufs1_dinode); + redo_ifile |= + (ino == LFS_IFILE_INUM && !(bp->b_flags & B_GATHERED)); + if (redo_ifile) { + mutex_enter(&lfs_lock); + fs->lfs_flags |= LFS_IFDIRTY; + mutex_exit(&lfs_lock); + /* Don't double-account */ + fs->lfs_idaddr = 0x0; + } + LFS_WRITESEGENTRY(sup, fs, oldsn, bp); /* Ifile */ + } + + return redo_ifile; +} + +int +lfs_writeinode(struct lfs *fs, struct segment *sp, struct inode *ip) +{ + struct buf *bp; + struct ufs1_dinode *cdp; + daddr_t daddr; + int32_t *daddrp; /* XXX ondisk32 */ + int i, ndx; + int redo_ifile = 0; + int gotblk = 0; + int count; + + ASSERT_SEGLOCK(fs); + if (!(ip->i_flag & IN_ALLMOD)) + return (0); + + /* Can't write ifile when writer is not set */ + KASSERT(ip->i_number != LFS_IFILE_INUM || fs->lfs_writer > 0 || + (sp->seg_flags & SEGM_CLEAN)); + + /* + * If this is the Ifile, see if writing it here will generate a + * temporary misaccounting. 
If it will, do the accounting and write + * the blocks, postponing the inode write until the accounting is + * solid. + */ + count = 0; + while (ip->i_number == LFS_IFILE_INUM) { + int redo = 0; + + if (sp->idp == NULL && sp->ibp == NULL && + (sp->seg_bytes_left < fs->lfs_ibsize || + sp->sum_bytes_left < sizeof(int32_t))) { + (void) lfs_writeseg(fs, sp); + continue; + } + + /* Look for dirty Ifile blocks */ + LIST_FOREACH(bp, &fs->lfs_ivnode->v_dirtyblkhd, b_vnbufs) { + if (!(bp->b_flags & B_GATHERED)) { + redo = 1; + break; + } + } + + if (redo == 0) + redo = lfs_update_iaddr(fs, sp, ip, 0x0); + if (redo == 0) + break; + + if (sp->idp) { + sp->idp->di_inumber = 0; + sp->idp = NULL; + } + ++count; + if (count > 2) + log(LOG_NOTICE, "lfs_writeinode: looping count=%d\n", count); + lfs_writefile(fs, sp, fs->lfs_ivnode); + } + + /* Allocate a new inode block if necessary. */ + if ((ip->i_number != LFS_IFILE_INUM || sp->idp == NULL) && + sp->ibp == NULL) { + /* Allocate a new segment if necessary. */ + if (sp->seg_bytes_left < fs->lfs_ibsize || + sp->sum_bytes_left < sizeof(int32_t)) + (void) lfs_writeseg(fs, sp); + + /* Get next inode block. */ + daddr = fs->lfs_offset; + fs->lfs_offset += btofsb(fs, fs->lfs_ibsize); + sp->ibp = *sp->cbpp++ = + getblk(VTOI(fs->lfs_ivnode)->i_devvp, + fsbtodb(fs, daddr), fs->lfs_ibsize, 0, 0); + gotblk++; + + /* Zero out inode numbers */ + for (i = 0; i < INOPB(fs); ++i) + ((struct ufs1_dinode *)sp->ibp->b_data)[i].di_inumber = + 0; + + ++sp->start_bpp; + fs->lfs_avail -= btofsb(fs, fs->lfs_ibsize); + /* Set remaining space counters. */ + sp->seg_bytes_left -= fs->lfs_ibsize; + sp->sum_bytes_left -= sizeof(int32_t); + ndx = fs->lfs_sumsize / sizeof(int32_t) - + sp->ninodes / INOPB(fs) - 1; + ((int32_t *)(sp->segsum))[ndx] = daddr; + } + + /* Check VU_DIROP in case there is a new file with no data blocks */ + if (ITOV(ip)->v_uflag & VU_DIROP) + ((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT); + + /* Update the inode times and copy the inode onto the inode page. */ + /* XXX kludge --- don't redirty the ifile just to put times on it */ + if (ip->i_number != LFS_IFILE_INUM) + LFS_ITIMES(ip, NULL, NULL, NULL); + + /* + * If this is the Ifile, and we've already written the Ifile in this + * partial segment, just overwrite it (it's not on disk yet) and + * continue. + * + * XXX we know that the bp that we get the second time around has + * already been gathered. + */ + if (ip->i_number == LFS_IFILE_INUM && sp->idp) { + *(sp->idp) = *ip->i_din.ffs1_din; + ip->i_lfs_osize = ip->i_size; + return 0; + } + + bp = sp->ibp; + cdp = ((struct ufs1_dinode *)bp->b_data) + (sp->ninodes % INOPB(fs)); + *cdp = *ip->i_din.ffs1_din; + + /* + * If cleaning, link counts and directory file sizes cannot change, + * since those would be directory operations---even if the file + * we are writing is marked VU_DIROP we should write the old values. + * If we're not cleaning, of course, update the values so we get + * current values the next time we clean. 
+ */ + if (sp->seg_flags & SEGM_CLEAN) { + if (ITOV(ip)->v_uflag & VU_DIROP) { + cdp->di_nlink = ip->i_lfs_odnlink; + /* if (ITOV(ip)->v_type == VDIR) */ + cdp->di_size = ip->i_lfs_osize; + } + } else { + ip->i_lfs_odnlink = cdp->di_nlink; + ip->i_lfs_osize = ip->i_size; + } + + + /* We can finish the segment accounting for truncations now */ + lfs_finalize_ino_seguse(fs, ip); + + /* + * If we are cleaning, ensure that we don't write UNWRITTEN disk + * addresses to disk; possibly change the on-disk record of + * the inode size, either by reverting to the previous size + * (in the case of cleaning) or by verifying the inode's block + * holdings (in the case of files being allocated as they are being + * written). + * XXX By not writing UNWRITTEN blocks, we are making the lfs_avail + * XXX count on disk wrong by the same amount. We should be + * XXX able to "borrow" from lfs_avail and return it after the + * XXX Ifile is written. See also in lfs_writeseg. + */ + + /* Check file size based on highest allocated block */ + if (((ip->i_ffs1_mode & IFMT) == IFREG || + (ip->i_ffs1_mode & IFMT) == IFDIR) && + ip->i_size > ((ip->i_lfs_hiblk + 1) << fs->lfs_bshift)) { + cdp->di_size = (ip->i_lfs_hiblk + 1) << fs->lfs_bshift; + DLOG((DLOG_SEG, "lfs_writeinode: ino %d size %" PRId64 " -> %" + PRId64 "\n", (int)ip->i_number, ip->i_size, cdp->di_size)); + } + if (ip->i_lfs_effnblks != ip->i_ffs1_blocks) { + DLOG((DLOG_SEG, "lfs_writeinode: cleansing ino %d eff %d != nblk %d)" + " at %x\n", ip->i_number, ip->i_lfs_effnblks, + ip->i_ffs1_blocks, fs->lfs_offset)); + for (daddrp = cdp->di_db; daddrp < cdp->di_ib + NIADDR; + daddrp++) { + if (*daddrp == UNWRITTEN) { + DLOG((DLOG_SEG, "lfs_writeinode: wiping UNWRITTEN\n")); + *daddrp = 0; + } + } + } + +#ifdef DIAGNOSTIC + /* + * Check dinode held blocks against dinode size. + * This should be identical to the check in lfs_vget(). + */ + for (i = (cdp->di_size + fs->lfs_bsize - 1) >> fs->lfs_bshift; + i < NDADDR; i++) { + KASSERT(i >= 0); + if ((cdp->di_mode & IFMT) == IFLNK) + continue; + if (((cdp->di_mode & IFMT) == IFBLK || + (cdp->di_mode & IFMT) == IFCHR) && i == 0) + continue; + if (cdp->di_db[i] != 0) { +# ifdef DEBUG + lfs_dump_dinode(cdp); +# endif + panic("writing inconsistent inode"); + } + } +#endif /* DIAGNOSTIC */ + + if (ip->i_flag & IN_CLEANING) + LFS_CLR_UINO(ip, IN_CLEANING); + else { + /* XXX IN_ALLMOD */ + LFS_CLR_UINO(ip, IN_ACCESSED | IN_ACCESS | IN_CHANGE | + IN_UPDATE | IN_MODIFY); + if (ip->i_lfs_effnblks == ip->i_ffs1_blocks) + LFS_CLR_UINO(ip, IN_MODIFIED); + else { + DLOG((DLOG_VNODE, "lfs_writeinode: ino %d: real " + "blks=%d, eff=%d\n", ip->i_number, + ip->i_ffs1_blocks, ip->i_lfs_effnblks)); + } + } + + if (ip->i_number == LFS_IFILE_INUM) { + /* We know sp->idp == NULL */ + sp->idp = ((struct ufs1_dinode *)bp->b_data) + + (sp->ninodes % INOPB(fs)); + + /* Not dirty any more */ + mutex_enter(&lfs_lock); + fs->lfs_flags &= ~LFS_IFDIRTY; + mutex_exit(&lfs_lock); + } + + if (gotblk) { + mutex_enter(&bufcache_lock); + LFS_LOCK_BUF(bp); + brelsel(bp, 0); + mutex_exit(&bufcache_lock); + } + + /* Increment inode count in segment summary block. */ + ++((SEGSUM *)(sp->segsum))->ss_ninos; + + /* If this page is full, set flag to allocate a new page. 
*/ + if (++sp->ninodes % INOPB(fs) == 0) + sp->ibp = NULL; + + redo_ifile = lfs_update_iaddr(fs, sp, ip, bp->b_blkno); + + KASSERT(redo_ifile == 0); + return (redo_ifile); +} + +int +lfs_gatherblock(struct segment *sp, struct buf *bp, kmutex_t *mptr) +{ + struct lfs *fs; + int vers; + int j, blksinblk; + + ASSERT_SEGLOCK(sp->fs); + /* + * If full, finish this segment. We may be doing I/O, so + * release and reacquire the splbio(). + */ +#ifdef DIAGNOSTIC + if (sp->vp == NULL) + panic ("lfs_gatherblock: Null vp in segment"); +#endif + fs = sp->fs; + blksinblk = howmany(bp->b_bcount, fs->lfs_bsize); + if (sp->sum_bytes_left < sizeof(int32_t) * blksinblk || + sp->seg_bytes_left < bp->b_bcount) { + if (mptr) + mutex_exit(mptr); + lfs_updatemeta(sp); + + vers = sp->fip->fi_version; + (void) lfs_writeseg(fs, sp); + + /* Add the current file to the segment summary. */ + lfs_acquire_finfo(fs, VTOI(sp->vp)->i_number, vers); + + if (mptr) + mutex_enter(mptr); + return (1); + } + + if (bp->b_flags & B_GATHERED) { + DLOG((DLOG_SEG, "lfs_gatherblock: already gathered! Ino %d," + " lbn %" PRId64 "\n", + sp->fip->fi_ino, bp->b_lblkno)); + return (0); + } + + /* Insert into the buffer list, update the FINFO block. */ + bp->b_flags |= B_GATHERED; + + *sp->cbpp++ = bp; + for (j = 0; j < blksinblk; j++) { + sp->fip->fi_blocks[sp->fip->fi_nblocks++] = bp->b_lblkno + j; + /* This block's accounting moves from lfs_favail to lfs_avail */ + lfs_deregister_block(sp->vp, bp->b_lblkno + j); + } + + sp->sum_bytes_left -= sizeof(int32_t) * blksinblk; + sp->seg_bytes_left -= bp->b_bcount; + return (0); +} + +int +lfs_gather(struct lfs *fs, struct segment *sp, struct vnode *vp, + int (*match)(struct lfs *, struct buf *)) +{ + struct buf *bp, *nbp; + int count = 0; + + ASSERT_SEGLOCK(fs); + if (vp->v_type == VBLK) + return 0; + KASSERT(sp->vp == NULL); + sp->vp = vp; + mutex_enter(&bufcache_lock); + +#ifndef LFS_NO_BACKBUF_HACK +/* This is a hack to see if ordering the blocks in LFS makes a difference. */ +# define BUF_OFFSET \ + (((char *)&LIST_NEXT(bp, b_vnbufs)) - (char *)bp) +# define BACK_BUF(BP) \ + ((struct buf *)(((char *)(BP)->b_vnbufs.le_prev) - BUF_OFFSET)) +# define BEG_OF_LIST \ + ((struct buf *)(((char *)&LIST_FIRST(&vp->v_dirtyblkhd)) - BUF_OFFSET)) + +loop: + /* Find last buffer. 
*/ + for (bp = LIST_FIRST(&vp->v_dirtyblkhd); + bp && LIST_NEXT(bp, b_vnbufs) != NULL; + bp = LIST_NEXT(bp, b_vnbufs)) + /* nothing */; + for (; bp && bp != BEG_OF_LIST; bp = nbp) { + nbp = BACK_BUF(bp); +#else /* LFS_NO_BACKBUF_HACK */ +loop: + for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = LIST_NEXT(bp, b_vnbufs); +#endif /* LFS_NO_BACKBUF_HACK */ + if ((bp->b_cflags & BC_BUSY) != 0 || + (bp->b_flags & B_GATHERED) != 0 || !match(fs, bp)) { +#ifdef DEBUG + if (vp == fs->lfs_ivnode && + (bp->b_cflags & BC_BUSY) != 0 && + (bp->b_flags & B_GATHERED) == 0) + log(LOG_NOTICE, "lfs_gather: ifile lbn %" + PRId64 " busy (%x) at 0x%x", + bp->b_lblkno, bp->b_flags, + (unsigned)fs->lfs_offset); +#endif + continue; + } +#ifdef DIAGNOSTIC +# ifdef LFS_USE_B_INVAL + if ((bp->b_flags & BC_INVAL) != 0 && bp->b_iodone == NULL) { + DLOG((DLOG_SEG, "lfs_gather: lbn %" PRId64 + " is BC_INVAL\n", bp->b_lblkno)); + VOP_PRINT(bp->b_vp); + } +# endif /* LFS_USE_B_INVAL */ + if (!(bp->b_oflags & BO_DELWRI)) + panic("lfs_gather: bp not BO_DELWRI"); + if (!(bp->b_flags & B_LOCKED)) { + DLOG((DLOG_SEG, "lfs_gather: lbn %" PRId64 + " blk %" PRId64 " not B_LOCKED\n", + bp->b_lblkno, + dbtofsb(fs, bp->b_blkno))); + VOP_PRINT(bp->b_vp); + panic("lfs_gather: bp not B_LOCKED"); + } +#endif + if (lfs_gatherblock(sp, bp, &bufcache_lock)) { + goto loop; + } + count++; + } + mutex_exit(&bufcache_lock); + lfs_updatemeta(sp); + KASSERT(sp->vp == vp); + sp->vp = NULL; + return count; +} + +#if DEBUG +# define DEBUG_OOFF(n) do { \ + if (ooff == 0) { \ + DLOG((DLOG_SEG, "lfs_updatemeta[%d]: warning: writing " \ + "ino %d lbn %" PRId64 " at 0x%" PRIx32 \ + ", was 0x0 (or %" PRId64 ")\n", \ + (n), ip->i_number, lbn, ndaddr, daddr)); \ + } \ +} while (0) +#else +# define DEBUG_OOFF(n) +#endif + +/* + * Change the given block's address to ndaddr, finding its previous + * location using ufs_bmaparray(). + * + * Account for this change in the segment table. + * + * called with sp == NULL by roll-forwarding code. 
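+ * When sp is NULL, no partial segment is under construction, so
+ * the duplicate-inode (ndupino) adjustment below is skipped.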
+ */ +void +lfs_update_single(struct lfs *fs, struct segment *sp, + struct vnode *vp, daddr_t lbn, int32_t ndaddr, int size) +{ + SEGUSE *sup; + struct buf *bp; + struct indir a[NIADDR + 2], *ap; + struct inode *ip; + daddr_t daddr, ooff; + int num, error; + int bb, osize, obb; + + ASSERT_SEGLOCK(fs); + KASSERT(sp == NULL || sp->vp == vp); + ip = VTOI(vp); + + error = ufs_bmaparray(vp, lbn, &daddr, a, &num, NULL, NULL); + if (error) + panic("lfs_updatemeta: ufs_bmaparray returned %d", error); + + daddr = (daddr_t)((int32_t)daddr); /* XXX ondisk32 */ + KASSERT(daddr <= LFS_MAX_DADDR); + if (daddr > 0) + daddr = dbtofsb(fs, daddr); + + bb = numfrags(fs, size); + switch (num) { + case 0: + ooff = ip->i_ffs1_db[lbn]; + DEBUG_OOFF(0); + if (ooff == UNWRITTEN) + ip->i_ffs1_blocks += bb; + else { + /* possible fragment truncation or extension */ + obb = btofsb(fs, ip->i_lfs_fragsize[lbn]); + ip->i_ffs1_blocks += (bb - obb); + } + ip->i_ffs1_db[lbn] = ndaddr; + break; + case 1: + ooff = ip->i_ffs1_ib[a[0].in_off]; + DEBUG_OOFF(1); + if (ooff == UNWRITTEN) + ip->i_ffs1_blocks += bb; + ip->i_ffs1_ib[a[0].in_off] = ndaddr; + break; + default: + ap = &a[num - 1]; + if (bread(vp, ap->in_lbn, fs->lfs_bsize, NOCRED, + B_MODIFY, &bp)) + panic("lfs_updatemeta: bread bno %" PRId64, + ap->in_lbn); + + /* XXX ondisk32 */ + ooff = ((int32_t *)bp->b_data)[ap->in_off]; + DEBUG_OOFF(num); + if (ooff == UNWRITTEN) + ip->i_ffs1_blocks += bb; + /* XXX ondisk32 */ + ((int32_t *)bp->b_data)[ap->in_off] = ndaddr; + (void) VOP_BWRITE(bp->b_vp, bp); + } + + KASSERT(ooff == 0 || ooff == UNWRITTEN || ooff == daddr); + + /* Update hiblk when extending the file */ + if (lbn > ip->i_lfs_hiblk) + ip->i_lfs_hiblk = lbn; + + /* + * Though we'd rather it couldn't, this *can* happen right now + * if cleaning blocks and regular blocks coexist. + */ + /* KASSERT(daddr < fs->lfs_lastpseg || daddr > ndaddr); */ + + /* + * Update segment usage information, based on old size + * and location. + */ + if (daddr > 0) { + u_int32_t oldsn = dtosn(fs, daddr); +#ifdef DIAGNOSTIC + int ndupino; + + if (sp && sp->seg_number == oldsn) { + ndupino = sp->ndupino; + } else { + ndupino = 0; + } +#endif + KASSERT(oldsn < fs->lfs_nseg); + if (lbn >= 0 && lbn < NDADDR) + osize = ip->i_lfs_fragsize[lbn]; + else + osize = fs->lfs_bsize; + LFS_SEGENTRY(sup, fs, oldsn, bp); +#ifdef DIAGNOSTIC + if (sup->su_nbytes + sizeof (struct ufs1_dinode) * ndupino + < osize) { + printf("lfs_updatemeta: negative bytes " + "(segment %" PRIu32 " short by %" PRId64 + ")\n", dtosn(fs, daddr), + (int64_t)osize - + (sizeof (struct ufs1_dinode) * ndupino + + sup->su_nbytes)); + printf("lfs_updatemeta: ino %llu, lbn %" PRId64 + ", addr = 0x%" PRIx64 "\n", + (unsigned long long)ip->i_number, lbn, daddr); + printf("lfs_updatemeta: ndupino=%d\n", ndupino); + panic("lfs_updatemeta: negative bytes"); + sup->su_nbytes = osize - + sizeof (struct ufs1_dinode) * ndupino; + } +#endif + DLOG((DLOG_SU, "seg %" PRIu32 " -= %d for ino %d lbn %" PRId64 + " db 0x%" PRIx64 "\n", + dtosn(fs, daddr), osize, + ip->i_number, lbn, daddr)); + sup->su_nbytes -= osize; + if (!(bp->b_flags & B_GATHERED)) { + mutex_enter(&lfs_lock); + fs->lfs_flags |= LFS_IFDIRTY; + mutex_exit(&lfs_lock); + } + LFS_WRITESEGENTRY(sup, fs, oldsn, bp); + } + /* + * Now that this block has a new address, and its old + * segment no longer owns it, we can forget about its + * old size. 
+ */ + if (lbn >= 0 && lbn < NDADDR) + ip->i_lfs_fragsize[lbn] = size; +} + +/* + * Update the metadata that points to the blocks listed in the FINFO + * array. + */ +void +lfs_updatemeta(struct segment *sp) +{ + struct buf *sbp; + struct lfs *fs; + struct vnode *vp; + daddr_t lbn; + int i, nblocks, num; + int bb; + int bytesleft, size; + + ASSERT_SEGLOCK(sp->fs); + vp = sp->vp; + nblocks = &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp; + KASSERT(nblocks >= 0); + KASSERT(vp != NULL); + if (nblocks == 0) + return; + + /* + * This count may be high due to oversize blocks from lfs_gop_write. + * Correct for this. (XXX we should be able to keep track of these.) + */ + fs = sp->fs; + for (i = 0; i < nblocks; i++) { + if (sp->start_bpp[i] == NULL) { + DLOG((DLOG_SEG, "lfs_updatemeta: nblocks = %d, not %d\n", i, nblocks)); + nblocks = i; + break; + } + num = howmany(sp->start_bpp[i]->b_bcount, fs->lfs_bsize); + KASSERT(sp->start_bpp[i]->b_lblkno >= 0 || num == 1); + nblocks -= num - 1; + } + + KASSERT(vp->v_type == VREG || + nblocks == &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp); + KASSERT(nblocks == sp->cbpp - sp->start_bpp); + + /* + * Sort the blocks. + * + * We have to sort even if the blocks come from the + * cleaner, because there might be other pending blocks on the + * same inode...and if we don't sort, and there are fragments + * present, blocks may be written in the wrong place. + */ + lfs_shellsort(sp->start_bpp, sp->start_lbp, nblocks, fs->lfs_bsize); + + /* + * Record the length of the last block in case it's a fragment. + * If there are indirect blocks present, they sort last. An + * indirect block will be lfs_bsize and its presence indicates + * that you cannot have fragments. + * + * XXX This last is a lie. A cleaned fragment can coexist with + * XXX a later indirect block. This will continue to be + * XXX true until lfs_markv is fixed to do everything with + * XXX fake blocks (including fake inodes and fake indirect blocks). + */ + sp->fip->fi_lastlength = ((sp->start_bpp[nblocks - 1]->b_bcount - 1) & + fs->lfs_bmask) + 1; + + /* + * Assign disk addresses, and update references to the logical + * block and the segment usage information. + */ + for (i = nblocks; i--; ++sp->start_bpp) { + sbp = *sp->start_bpp; + lbn = *sp->start_lbp; + KASSERT(sbp->b_lblkno == lbn); + + sbp->b_blkno = fsbtodb(fs, fs->lfs_offset); + + /* + * If we write a frag in the wrong place, the cleaner won't + * be able to correctly identify its size later, and the + * segment will be uncleanable. (Even worse, it will assume + * that the indirect block that actually ends the list + * is of a smaller size!) + */ + if ((sbp->b_bcount & fs->lfs_bmask) && i != 0) + panic("lfs_updatemeta: fragment is not last block"); + + /* + * For each subblock in this possibly oversized block, + * update its address on disk. + */ + KASSERT(lbn >= 0 || sbp->b_bcount == fs->lfs_bsize); + KASSERT(vp == sbp->b_vp); + for (bytesleft = sbp->b_bcount; bytesleft > 0; + bytesleft -= fs->lfs_bsize) { + size = MIN(bytesleft, fs->lfs_bsize); + bb = numfrags(fs, size); + lbn = *sp->start_lbp++; + lfs_update_single(fs, sp, sp->vp, lbn, fs->lfs_offset, + size); + fs->lfs_offset += bb; + } + + } + + /* This inode has been modified */ + LFS_SET_UINO(VTOI(vp), IN_MODIFIED); +} + +/* + * Move lfs_offset to a segment earlier than sn. 
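+ * Returns 0 on success; ENOENT if no clean segment earlier than
+ * newsn could be found.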
+ */ +int +lfs_rewind(struct lfs *fs, int newsn) +{ + int sn, osn, isdirty; + struct buf *bp; + SEGUSE *sup; + + ASSERT_SEGLOCK(fs); + + osn = dtosn(fs, fs->lfs_offset); + if (osn < newsn) + return 0; + + /* lfs_avail eats the remaining space in this segment */ + fs->lfs_avail -= fs->lfs_fsbpseg - (fs->lfs_offset - fs->lfs_curseg); + + /* Find a low-numbered segment */ + for (sn = 0; sn < fs->lfs_nseg; ++sn) { + LFS_SEGENTRY(sup, fs, sn, bp); + isdirty = sup->su_flags & SEGUSE_DIRTY; + brelse(bp, 0); + + if (!isdirty) + break; + } + if (sn == fs->lfs_nseg) + panic("lfs_rewind: no clean segments"); + if (newsn >= 0 && sn >= newsn) + return ENOENT; + fs->lfs_nextseg = sn; + lfs_newseg(fs); + fs->lfs_offset = fs->lfs_curseg; + + return 0; +} + +/* + * Start a new partial segment. + * + * Return 1 when we entered to a new segment. + * Otherwise, return 0. + */ +int +lfs_initseg(struct lfs *fs) +{ + struct segment *sp = fs->lfs_sp; + SEGSUM *ssp; + struct buf *sbp; /* buffer for SEGSUM */ + int repeat = 0; /* return value */ + + ASSERT_SEGLOCK(fs); + /* Advance to the next segment. */ + if (!LFS_PARTIAL_FITS(fs)) { + SEGUSE *sup; + struct buf *bp; + + /* lfs_avail eats the remaining space */ + fs->lfs_avail -= fs->lfs_fsbpseg - (fs->lfs_offset - + fs->lfs_curseg); + /* Wake up any cleaning procs waiting on this file system. */ + lfs_wakeup_cleaner(fs); + lfs_newseg(fs); + repeat = 1; + fs->lfs_offset = fs->lfs_curseg; + + sp->seg_number = dtosn(fs, fs->lfs_curseg); + sp->seg_bytes_left = fsbtob(fs, fs->lfs_fsbpseg); + + /* + * If the segment contains a superblock, update the offset + * and summary address to skip over it. + */ + LFS_SEGENTRY(sup, fs, sp->seg_number, bp); + if (sup->su_flags & SEGUSE_SUPERBLOCK) { + fs->lfs_offset += btofsb(fs, LFS_SBPAD); + sp->seg_bytes_left -= LFS_SBPAD; + } + brelse(bp, 0); + /* Segment zero could also contain the labelpad */ + if (fs->lfs_version > 1 && sp->seg_number == 0 && + fs->lfs_start < btofsb(fs, LFS_LABELPAD)) { + fs->lfs_offset += + btofsb(fs, LFS_LABELPAD) - fs->lfs_start; + sp->seg_bytes_left -= + LFS_LABELPAD - fsbtob(fs, fs->lfs_start); + } + } else { + sp->seg_number = dtosn(fs, fs->lfs_curseg); + sp->seg_bytes_left = fsbtob(fs, fs->lfs_fsbpseg - + (fs->lfs_offset - fs->lfs_curseg)); + } + fs->lfs_lastpseg = fs->lfs_offset; + + /* Record first address of this partial segment */ + if (sp->seg_flags & SEGM_CLEAN) { + fs->lfs_cleanint[fs->lfs_cleanind] = fs->lfs_offset; + if (++fs->lfs_cleanind >= LFS_MAX_CLEANIND) { + /* "1" is the artificial inc in lfs_seglock */ + mutex_enter(&lfs_lock); + while (fs->lfs_iocount > 1) { + mtsleep(&fs->lfs_iocount, PRIBIO + 1, + "lfs_initseg", 0, &lfs_lock); + } + mutex_exit(&lfs_lock); + fs->lfs_cleanind = 0; + } + } + + sp->fs = fs; + sp->ibp = NULL; + sp->idp = NULL; + sp->ninodes = 0; + sp->ndupino = 0; + + sp->cbpp = sp->bpp; + + /* Get a new buffer for SEGSUM */ + sbp = lfs_newbuf(fs, VTOI(fs->lfs_ivnode)->i_devvp, + fsbtodb(fs, fs->lfs_offset), fs->lfs_sumsize, LFS_NB_SUMMARY); + + /* ... and enter it into the buffer list. */ + *sp->cbpp = sbp; + sp->cbpp++; + fs->lfs_offset += btofsb(fs, fs->lfs_sumsize); + + sp->start_bpp = sp->cbpp; + + /* Set point to SEGSUM, initialize it. */ + ssp = sp->segsum = sbp->b_data; + memset(ssp, 0, fs->lfs_sumsize); + ssp->ss_next = fs->lfs_nextseg; + ssp->ss_nfinfo = ssp->ss_ninos = 0; + ssp->ss_magic = SS_MAGIC; + + /* Set pointer to first FINFO, initialize it. 
*/ + sp->fip = (struct finfo *)((char *)sp->segsum + SEGSUM_SIZE(fs)); + sp->fip->fi_nblocks = 0; + sp->start_lbp = &sp->fip->fi_blocks[0]; + sp->fip->fi_lastlength = 0; + + sp->seg_bytes_left -= fs->lfs_sumsize; + sp->sum_bytes_left = fs->lfs_sumsize - SEGSUM_SIZE(fs); + + return (repeat); +} + +/* + * Remove SEGUSE_INVAL from all segments. + */ +void +lfs_unset_inval_all(struct lfs *fs) +{ + SEGUSE *sup; + struct buf *bp; + int i; + + for (i = 0; i < fs->lfs_nseg; i++) { + LFS_SEGENTRY(sup, fs, i, bp); + if (sup->su_flags & SEGUSE_INVAL) { + sup->su_flags &= ~SEGUSE_INVAL; + LFS_WRITESEGENTRY(sup, fs, i, bp); + } else + brelse(bp, 0); + } +} + +/* + * Return the next segment to write. + */ +void +lfs_newseg(struct lfs *fs) +{ + CLEANERINFO *cip; + SEGUSE *sup; + struct buf *bp; + int curseg, isdirty, sn, skip_inval; + + ASSERT_SEGLOCK(fs); + + /* Honor LFCNWRAPSTOP */ + mutex_enter(&lfs_lock); + while (fs->lfs_nextseg < fs->lfs_curseg && fs->lfs_nowrap) { + if (fs->lfs_wrappass) { + log(LOG_NOTICE, "%s: wrappass=%d\n", + fs->lfs_fsmnt, fs->lfs_wrappass); + fs->lfs_wrappass = 0; + break; + } + fs->lfs_wrapstatus = LFS_WRAP_WAITING; + wakeup(&fs->lfs_nowrap); + log(LOG_NOTICE, "%s: waiting at log wrap\n", fs->lfs_fsmnt); + mtsleep(&fs->lfs_wrappass, PVFS, "newseg", 10 * hz, + &lfs_lock); + } + fs->lfs_wrapstatus = LFS_WRAP_GOING; + mutex_exit(&lfs_lock); + + LFS_SEGENTRY(sup, fs, dtosn(fs, fs->lfs_nextseg), bp); + DLOG((DLOG_SU, "lfs_newseg: seg %d := 0 in newseg\n", + dtosn(fs, fs->lfs_nextseg))); + sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE; + sup->su_nbytes = 0; + sup->su_nsums = 0; + sup->su_ninos = 0; + LFS_WRITESEGENTRY(sup, fs, dtosn(fs, fs->lfs_nextseg), bp); + + LFS_CLEANERINFO(cip, fs, bp); + --cip->clean; + ++cip->dirty; + fs->lfs_nclean = cip->clean; + LFS_SYNC_CLEANERINFO(cip, fs, bp, 1); + + fs->lfs_lastseg = fs->lfs_curseg; + fs->lfs_curseg = fs->lfs_nextseg; + skip_inval = 1; + for (sn = curseg = dtosn(fs, fs->lfs_curseg) + fs->lfs_interleave;;) { + sn = (sn + 1) % fs->lfs_nseg; + + if (sn == curseg) { + if (skip_inval) + skip_inval = 0; + else + panic("lfs_nextseg: no clean segments"); + } + LFS_SEGENTRY(sup, fs, sn, bp); + isdirty = sup->su_flags & (SEGUSE_DIRTY | (skip_inval ? 
SEGUSE_INVAL : 0)); + /* Check SEGUSE_EMPTY as we go along */ + if (isdirty && sup->su_nbytes == 0 && + !(sup->su_flags & SEGUSE_EMPTY)) + LFS_WRITESEGENTRY(sup, fs, sn, bp); + else + brelse(bp, 0); + + if (!isdirty) + break; + } + if (skip_inval == 0) + lfs_unset_inval_all(fs); + + ++fs->lfs_nactive; + fs->lfs_nextseg = sntod(fs, sn); + if (lfs_dostats) { + ++lfs_stats.segsused; + } +} + +static struct buf * +lfs_newclusterbuf(struct lfs *fs, struct vnode *vp, daddr_t addr, + int n) +{ + struct lfs_cluster *cl; + struct buf **bpp, *bp; + + ASSERT_SEGLOCK(fs); + cl = (struct lfs_cluster *)pool_get(&fs->lfs_clpool, PR_WAITOK); + bpp = (struct buf **)pool_get(&fs->lfs_bpppool, PR_WAITOK); + memset(cl, 0, sizeof(*cl)); + cl->fs = fs; + cl->bpp = bpp; + cl->bufcount = 0; + cl->bufsize = 0; + + /* If this segment is being written synchronously, note that */ + if (fs->lfs_sp->seg_flags & SEGM_SYNC) { + cl->flags |= LFS_CL_SYNC; + cl->seg = fs->lfs_sp; + ++cl->seg->seg_iocount; + } + + /* Get an empty buffer header, or maybe one with something on it */ + bp = getiobuf(vp, true); + bp->b_dev = NODEV; + bp->b_blkno = bp->b_lblkno = addr; + bp->b_iodone = lfs_cluster_callback; + bp->b_private = cl; + + return bp; +} + +int +lfs_writeseg(struct lfs *fs, struct segment *sp) +{ + struct buf **bpp, *bp, *cbp, *newbp, *unbusybp; + SEGUSE *sup; + SEGSUM *ssp; + int i; + int do_again, nblocks, byteoffset; + size_t el_size; + struct lfs_cluster *cl; + u_short ninos; + struct vnode *devvp; + char *p = NULL; + struct vnode *vp; + int32_t *daddrp; /* XXX ondisk32 */ + int changed; + u_int32_t sum; +#ifdef DEBUG + FINFO *fip; + int findex; +#endif + + ASSERT_SEGLOCK(fs); + + ssp = (SEGSUM *)sp->segsum; + + /* + * If there are no buffers other than the segment summary to write, + * don't do anything. If we are the end of a dirop sequence, however, + * write the empty segment summary anyway, to help out the + * roll-forward agent. + */ + if ((nblocks = sp->cbpp - sp->bpp) == 1) { + if ((ssp->ss_flags & (SS_DIROP | SS_CONT)) != SS_DIROP) + return 0; + } + + /* Note if partial segment is being written by the cleaner */ + if (sp->seg_flags & SEGM_CLEAN) + ssp->ss_flags |= SS_CLEAN; + + devvp = VTOI(fs->lfs_ivnode)->i_devvp; + + /* Update the segment usage information. */ + LFS_SEGENTRY(sup, fs, sp->seg_number, bp); + + /* Loop through all blocks, except the segment summary. */ + for (bpp = sp->bpp; ++bpp < sp->cbpp; ) { + if ((*bpp)->b_vp != devvp) { + sup->su_nbytes += (*bpp)->b_bcount; + DLOG((DLOG_SU, "seg %" PRIu32 " += %ld for ino %d" + " lbn %" PRId64 " db 0x%" PRIx64 "\n", + sp->seg_number, (*bpp)->b_bcount, + VTOI((*bpp)->b_vp)->i_number, (*bpp)->b_lblkno, + (*bpp)->b_blkno)); + } + } + +#ifdef DEBUG + /* Check for zero-length and zero-version FINFO entries. 
*/ + fip = (struct finfo *)((char *)ssp + SEGSUM_SIZE(fs)); + for (findex = 0; findex < ssp->ss_nfinfo; findex++) { + KDASSERT(fip->fi_nblocks > 0); + KDASSERT(fip->fi_version > 0); + fip = (FINFO *)((char *)fip + FINFOSIZE + + sizeof(int32_t) * fip->fi_nblocks); + } +#endif /* DEBUG */ + + ninos = (ssp->ss_ninos + INOPB(fs) - 1) / INOPB(fs); + DLOG((DLOG_SU, "seg %d += %d for %d inodes\n", + sp->seg_number, ssp->ss_ninos * sizeof (struct ufs1_dinode), + ssp->ss_ninos)); + sup->su_nbytes += ssp->ss_ninos * sizeof (struct ufs1_dinode); + /* sup->su_nbytes += fs->lfs_sumsize; */ + if (fs->lfs_version == 1) + sup->su_olastmod = time_second; + else + sup->su_lastmod = time_second; + sup->su_ninos += ninos; + ++sup->su_nsums; + fs->lfs_avail -= btofsb(fs, fs->lfs_sumsize); + + do_again = !(bp->b_flags & B_GATHERED); + LFS_WRITESEGENTRY(sup, fs, sp->seg_number, bp); /* Ifile */ + + /* + * Mark blocks B_BUSY, to prevent then from being changed between + * the checksum computation and the actual write. + * + * If we are cleaning, check indirect blocks for UNWRITTEN, and if + * there are any, replace them with copies that have UNASSIGNED + * instead. + */ + mutex_enter(&bufcache_lock); + for (bpp = sp->bpp, i = nblocks - 1; i--;) { + ++bpp; + bp = *bpp; + if (bp->b_iodone != NULL) { /* UBC or malloced buffer */ + bp->b_cflags |= BC_BUSY; + continue; + } + + while (bp->b_cflags & BC_BUSY) { + DLOG((DLOG_SEG, "lfs_writeseg: avoiding potential" + " data summary corruption for ino %d, lbn %" + PRId64 "\n", + VTOI(bp->b_vp)->i_number, bp->b_lblkno)); + bp->b_cflags |= BC_WANTED; + cv_wait(&bp->b_busy, &bufcache_lock); + } + bp->b_cflags |= BC_BUSY; + mutex_exit(&bufcache_lock); + unbusybp = NULL; + + /* + * Check and replace indirect block UNWRITTEN bogosity. + * XXX See comment in lfs_writefile. + */ + if (bp->b_lblkno < 0 && bp->b_vp != devvp && bp->b_vp && + VTOI(bp->b_vp)->i_ffs1_blocks != + VTOI(bp->b_vp)->i_lfs_effnblks) { + DLOG((DLOG_VNODE, "lfs_writeseg: cleansing ino %d (%d != %d)\n", + VTOI(bp->b_vp)->i_number, + VTOI(bp->b_vp)->i_lfs_effnblks, + VTOI(bp->b_vp)->i_ffs1_blocks)); + /* Make a copy we'll make changes to */ + newbp = lfs_newbuf(fs, bp->b_vp, bp->b_lblkno, + bp->b_bcount, LFS_NB_IBLOCK); + newbp->b_blkno = bp->b_blkno; + memcpy(newbp->b_data, bp->b_data, + newbp->b_bcount); + + changed = 0; + /* XXX ondisk32 */ + for (daddrp = (int32_t *)(newbp->b_data); + daddrp < (int32_t *)((char *)newbp->b_data + + newbp->b_bcount); daddrp++) { + if (*daddrp == UNWRITTEN) { + ++changed; + *daddrp = 0; + } + } + /* + * Get rid of the old buffer. Don't mark it clean, + * though, if it still has dirty data on it. + */ + if (changed) { + DLOG((DLOG_SEG, "lfs_writeseg: replacing UNWRITTEN(%d):" + " bp = %p newbp = %p\n", changed, bp, + newbp)); + *bpp = newbp; + bp->b_flags &= ~B_GATHERED; + bp->b_error = 0; + if (bp->b_iodone != NULL) { + DLOG((DLOG_SEG, "lfs_writeseg: " + "indir bp should not be B_CALL\n")); + biodone(bp); + bp = NULL; + } else { + /* Still on free list, leave it there */ + unbusybp = bp; + /* + * We have to re-decrement lfs_avail + * since this block is going to come + * back around to us in the next + * segment. 
+ */ + fs->lfs_avail -= + btofsb(fs, bp->b_bcount); + } + } else { + lfs_freebuf(fs, newbp); + } + } + mutex_enter(&bufcache_lock); + if (unbusybp != NULL) { + unbusybp->b_cflags &= ~BC_BUSY; + if (unbusybp->b_cflags & BC_WANTED) + cv_broadcast(&bp->b_busy); + } + } + mutex_exit(&bufcache_lock); + + /* + * Compute checksum across data and then across summary; the first + * block (the summary block) is skipped. Set the create time here + * so that it's guaranteed to be later than the inode mod times. + */ + sum = 0; + if (fs->lfs_version == 1) + el_size = sizeof(u_long); + else + el_size = sizeof(u_int32_t); + for (bpp = sp->bpp, i = nblocks - 1; i--; ) { + ++bpp; + /* Loop through gop_write cluster blocks */ + for (byteoffset = 0; byteoffset < (*bpp)->b_bcount; + byteoffset += fs->lfs_bsize) { +#ifdef LFS_USE_B_INVAL + if (((*bpp)->b_cflags & BC_INVAL) != 0 && + (*bpp)->b_iodone != NULL) { + if (copyin((void *)(*bpp)->b_saveaddr + + byteoffset, dp, el_size)) { + panic("lfs_writeseg: copyin failed [1]:" + " ino %d blk %" PRId64, + VTOI((*bpp)->b_vp)->i_number, + (*bpp)->b_lblkno); + } + } else +#endif /* LFS_USE_B_INVAL */ + { + sum = lfs_cksum_part((char *) + (*bpp)->b_data + byteoffset, el_size, sum); + } + } + } + if (fs->lfs_version == 1) + ssp->ss_ocreate = time_second; + else { + ssp->ss_create = time_second; + ssp->ss_serial = ++fs->lfs_serial; + ssp->ss_ident = fs->lfs_ident; + } + ssp->ss_datasum = lfs_cksum_fold(sum); + ssp->ss_sumsum = cksum(&ssp->ss_datasum, + fs->lfs_sumsize - sizeof(ssp->ss_sumsum)); + + mutex_enter(&lfs_lock); + fs->lfs_bfree -= (btofsb(fs, ninos * fs->lfs_ibsize) + + btofsb(fs, fs->lfs_sumsize)); + fs->lfs_dmeta += (btofsb(fs, ninos * fs->lfs_ibsize) + + btofsb(fs, fs->lfs_sumsize)); + mutex_exit(&lfs_lock); + + /* + * When we simply write the blocks we lose a rotation for every block + * written. To avoid this problem, we cluster the buffers into a + * chunk and write the chunk. MAXPHYS is the largest size I/O + * devices can handle, use that for the size of the chunks. + * + * Blocks that are already clusters (from GOP_WRITE), however, we + * don't bother to copy into other clusters. + */ + +#define CHUNKSIZE MAXPHYS + + if (devvp == NULL) + panic("devvp is NULL"); + for (bpp = sp->bpp, i = nblocks; i;) { + cbp = lfs_newclusterbuf(fs, devvp, (*bpp)->b_blkno, i); + cl = cbp->b_private; + + cbp->b_flags |= B_ASYNC; + cbp->b_cflags |= BC_BUSY; + cbp->b_bcount = 0; + +#if defined(DEBUG) && defined(DIAGNOSTIC) + if (bpp - sp->bpp > (fs->lfs_sumsize - SEGSUM_SIZE(fs)) + / sizeof(int32_t)) { + panic("lfs_writeseg: real bpp overwrite"); + } + if (bpp - sp->bpp > segsize(fs) / fs->lfs_fsize) { + panic("lfs_writeseg: theoretical bpp overwrite"); + } +#endif + + /* + * Construct the cluster. 
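+ * Blocks are copied into a malloc'ed CHUNKSIZE buffer; oversized
+ * clusters from GOP_WRITE are referenced directly and get a
+ * cluster buffer of their own instead of being copied.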
+ */ + mutex_enter(&lfs_lock); + ++fs->lfs_iocount; + mutex_exit(&lfs_lock); + while (i && cbp->b_bcount < CHUNKSIZE) { + bp = *bpp; + + if (bp->b_bcount > (CHUNKSIZE - cbp->b_bcount)) + break; + if (cbp->b_bcount > 0 && !(cl->flags & LFS_CL_MALLOC)) + break; + + /* Clusters from GOP_WRITE are expedited */ + if (bp->b_bcount > fs->lfs_bsize) { + if (cbp->b_bcount > 0) + /* Put in its own buffer */ + break; + else { + cbp->b_data = bp->b_data; + } + } else if (cbp->b_bcount == 0) { + p = cbp->b_data = lfs_malloc(fs, CHUNKSIZE, + LFS_NB_CLUSTER); + cl->flags |= LFS_CL_MALLOC; + } +#ifdef DIAGNOSTIC + if (dtosn(fs, dbtofsb(fs, bp->b_blkno + + btodb(bp->b_bcount - 1))) != + sp->seg_number) { + printf("blk size %d daddr %" PRIx64 + " not in seg %d\n", + bp->b_bcount, bp->b_blkno, + sp->seg_number); + panic("segment overwrite"); + } +#endif + +#ifdef LFS_USE_B_INVAL + /* + * Fake buffers from the cleaner are marked as B_INVAL. + * We need to copy the data from user space rather than + * from the buffer indicated. + * XXX == what do I do on an error? + */ + if ((bp->b_cflags & BC_INVAL) != 0 && + bp->b_iodone != NULL) { + if (copyin(bp->b_saveaddr, p, bp->b_bcount)) + panic("lfs_writeseg: " + "copyin failed [2]"); + } else +#endif /* LFS_USE_B_INVAL */ + if (cl->flags & LFS_CL_MALLOC) { + /* copy data into our cluster. */ + memcpy(p, bp->b_data, bp->b_bcount); + p += bp->b_bcount; + } + + cbp->b_bcount += bp->b_bcount; + cl->bufsize += bp->b_bcount; + + bp->b_flags &= ~B_READ; + bp->b_error = 0; + cl->bpp[cl->bufcount++] = bp; + + vp = bp->b_vp; + mutex_enter(&bufcache_lock); + mutex_enter(vp->v_interlock); + bp->b_oflags &= ~(BO_DELWRI | BO_DONE); + reassignbuf(bp, vp); + vp->v_numoutput++; + mutex_exit(vp->v_interlock); + mutex_exit(&bufcache_lock); + + bpp++; + i--; + } + if (fs->lfs_sp->seg_flags & SEGM_SYNC) + BIO_SETPRIO(cbp, BPRIO_TIMECRITICAL); + else + BIO_SETPRIO(cbp, BPRIO_TIMELIMITED); + mutex_enter(devvp->v_interlock); + devvp->v_numoutput++; + mutex_exit(devvp->v_interlock); + VOP_STRATEGY(devvp, cbp); + curlwp->l_ru.ru_oublock++; + } + + if (lfs_dostats) { + ++lfs_stats.psegwrites; + lfs_stats.blocktot += nblocks - 1; + if (fs->lfs_sp->seg_flags & SEGM_SYNC) + ++lfs_stats.psyncwrites; + if (fs->lfs_sp->seg_flags & SEGM_CLEAN) { + ++lfs_stats.pcleanwrites; + lfs_stats.cleanblocks += nblocks - 1; + } + } + + return (lfs_initseg(fs) || do_again); +} + +void +lfs_writesuper(struct lfs *fs, daddr_t daddr) +{ + struct buf *bp; + struct vnode *devvp = VTOI(fs->lfs_ivnode)->i_devvp; + int s; + + ASSERT_MAYBE_SEGLOCK(fs); +#ifdef DIAGNOSTIC + KASSERT(fs->lfs_magic == LFS_MAGIC); +#endif + /* + * If we can write one superblock while another is in + * progress, we risk not having a complete checkpoint if we crash. + * So, block here if a superblock write is in progress. + */ + mutex_enter(&lfs_lock); + s = splbio(); + while (fs->lfs_sbactive) { + mtsleep(&fs->lfs_sbactive, PRIBIO+1, "lfs sb", 0, + &lfs_lock); + } + fs->lfs_sbactive = daddr; + splx(s); + mutex_exit(&lfs_lock); + + /* Set timestamp of this version of the superblock */ + if (fs->lfs_version == 1) + fs->lfs_otstamp = time_second; + fs->lfs_tstamp = time_second; + + /* Checksum the superblock and copy it into a buffer. 
*/ + fs->lfs_cksum = lfs_sb_cksum(&(fs->lfs_dlfs)); + bp = lfs_newbuf(fs, devvp, + fsbtodb(fs, daddr), LFS_SBPAD, LFS_NB_SBLOCK); + memset((char *)bp->b_data + sizeof(struct dlfs), 0, + LFS_SBPAD - sizeof(struct dlfs)); + *(struct dlfs *)bp->b_data = fs->lfs_dlfs; + + bp->b_cflags |= BC_BUSY; + bp->b_flags = (bp->b_flags & ~B_READ) | B_ASYNC; + bp->b_oflags &= ~(BO_DONE | BO_DELWRI); + bp->b_error = 0; + bp->b_iodone = lfs_supercallback; + + if (fs->lfs_sp != NULL && fs->lfs_sp->seg_flags & SEGM_SYNC) + BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); + else + BIO_SETPRIO(bp, BPRIO_TIMELIMITED); + curlwp->l_ru.ru_oublock++; + + mutex_enter(devvp->v_interlock); + devvp->v_numoutput++; + mutex_exit(devvp->v_interlock); + + mutex_enter(&lfs_lock); + ++fs->lfs_iocount; + mutex_exit(&lfs_lock); + VOP_STRATEGY(devvp, bp); +} + +/* + * Logical block number match routines used when traversing the dirty block + * chain. + */ +int +lfs_match_fake(struct lfs *fs, struct buf *bp) +{ + + ASSERT_SEGLOCK(fs); + return LFS_IS_MALLOC_BUF(bp); +} + +#if 0 +int +lfs_match_real(struct lfs *fs, struct buf *bp) +{ + + ASSERT_SEGLOCK(fs); + return (lfs_match_data(fs, bp) && !lfs_match_fake(fs, bp)); +} +#endif + +int +lfs_match_data(struct lfs *fs, struct buf *bp) +{ + + ASSERT_SEGLOCK(fs); + return (bp->b_lblkno >= 0); +} + +int +lfs_match_indir(struct lfs *fs, struct buf *bp) +{ + daddr_t lbn; + + ASSERT_SEGLOCK(fs); + lbn = bp->b_lblkno; + return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 0); +} + +int +lfs_match_dindir(struct lfs *fs, struct buf *bp) +{ + daddr_t lbn; + + ASSERT_SEGLOCK(fs); + lbn = bp->b_lblkno; + return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 1); +} + +int +lfs_match_tindir(struct lfs *fs, struct buf *bp) +{ + daddr_t lbn; + + ASSERT_SEGLOCK(fs); + lbn = bp->b_lblkno; + return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 2); +} + +static void +lfs_free_aiodone(struct buf *bp) +{ + struct lfs *fs; + + KERNEL_LOCK(1, curlwp); + fs = bp->b_private; + ASSERT_NO_SEGLOCK(fs); + lfs_freebuf(fs, bp); + KERNEL_UNLOCK_LAST(curlwp); +} + +static void +lfs_super_aiodone(struct buf *bp) +{ + struct lfs *fs; + + KERNEL_LOCK(1, curlwp); + fs = bp->b_private; + ASSERT_NO_SEGLOCK(fs); + mutex_enter(&lfs_lock); + fs->lfs_sbactive = 0; + if (--fs->lfs_iocount <= 1) + wakeup(&fs->lfs_iocount); + wakeup(&fs->lfs_sbactive); + mutex_exit(&lfs_lock); + lfs_freebuf(fs, bp); + KERNEL_UNLOCK_LAST(curlwp); +} + +static void +lfs_cluster_aiodone(struct buf *bp) +{ + struct lfs_cluster *cl; + struct lfs *fs; + struct buf *tbp, *fbp; + struct vnode *vp, *devvp, *ovp; + struct inode *ip; + int error; + + KERNEL_LOCK(1, curlwp); + + error = bp->b_error; + cl = bp->b_private; + fs = cl->fs; + devvp = VTOI(fs->lfs_ivnode)->i_devvp; + ASSERT_NO_SEGLOCK(fs); + + /* Put the pages back, and release the buffer */ + while (cl->bufcount--) { + tbp = cl->bpp[cl->bufcount]; + KASSERT(tbp->b_cflags & BC_BUSY); + if (error) { + tbp->b_error = error; + } + + /* + * We're done with tbp. If it has not been re-dirtied since + * the cluster was written, free it. Otherwise, keep it on + * the locked list to be written again. 
+ */ + vp = tbp->b_vp; + + tbp->b_flags &= ~B_GATHERED; + + LFS_BCLEAN_LOG(fs, tbp); + + mutex_enter(&bufcache_lock); + if (tbp->b_iodone == NULL) { + KASSERT(tbp->b_flags & B_LOCKED); + bremfree(tbp); + if (vp) { + mutex_enter(vp->v_interlock); + reassignbuf(tbp, vp); + mutex_exit(vp->v_interlock); + } + tbp->b_flags |= B_ASYNC; /* for biodone */ + } + + if (((tbp->b_flags | tbp->b_oflags) & + (B_LOCKED | BO_DELWRI)) == B_LOCKED) + LFS_UNLOCK_BUF(tbp); + + if (tbp->b_oflags & BO_DONE) { + DLOG((DLOG_SEG, "blk %d biodone already (flags %lx)\n", + cl->bufcount, (long)tbp->b_flags)); + } + + if (tbp->b_iodone != NULL && !LFS_IS_MALLOC_BUF(tbp)) { + /* + * A buffer from the page daemon. + * We use the same iodone as it does, + * so we must manually disassociate its + * buffers from the vp. + */ + if ((ovp = tbp->b_vp) != NULL) { + /* This is just silly */ + mutex_enter(ovp->v_interlock); + brelvp(tbp); + mutex_exit(ovp->v_interlock); + tbp->b_vp = vp; + tbp->b_objlock = vp->v_interlock; + } + /* Put it back the way it was */ + tbp->b_flags |= B_ASYNC; + /* Master buffers have BC_AGE */ + if (tbp->b_private == tbp) + tbp->b_cflags |= BC_AGE; + } + mutex_exit(&bufcache_lock); + + biodone(tbp); + + /* + * If this is the last block for this vnode, but + * there are other blocks on its dirty list, + * set IN_MODIFIED/IN_CLEANING depending on what + * sort of block. Only do this for our mount point, + * not for, e.g., inode blocks that are attached to + * the devvp. + * XXX KS - Shouldn't we set *both* if both types + * of blocks are present (traverse the dirty list?) + */ + mutex_enter(&lfs_lock); + mutex_enter(vp->v_interlock); + if (vp != devvp && vp->v_numoutput == 0 && + (fbp = LIST_FIRST(&vp->v_dirtyblkhd)) != NULL) { + ip = VTOI(vp); + DLOG((DLOG_SEG, "lfs_cluster_aiodone: mark ino %d\n", + ip->i_number)); + if (LFS_IS_MALLOC_BUF(fbp)) + LFS_SET_UINO(ip, IN_CLEANING); + else + LFS_SET_UINO(ip, IN_MODIFIED); + } + cv_broadcast(&vp->v_cv); + mutex_exit(vp->v_interlock); + mutex_exit(&lfs_lock); + } + + /* Fix up the cluster buffer, and release it */ + if (cl->flags & LFS_CL_MALLOC) + lfs_free(fs, bp->b_data, LFS_NB_CLUSTER); + putiobuf(bp); + + /* Note i/o done */ + if (cl->flags & LFS_CL_SYNC) { + if (--cl->seg->seg_iocount == 0) + wakeup(&cl->seg->seg_iocount); + } + mutex_enter(&lfs_lock); +#ifdef DIAGNOSTIC + if (fs->lfs_iocount == 0) + panic("lfs_cluster_aiodone: zero iocount"); +#endif + if (--fs->lfs_iocount <= 1) + wakeup(&fs->lfs_iocount); + mutex_exit(&lfs_lock); + + KERNEL_UNLOCK_LAST(curlwp); + + pool_put(&fs->lfs_bpppool, cl->bpp); + cl->bpp = NULL; + pool_put(&fs->lfs_clpool, cl); +} + +static void +lfs_generic_callback(struct buf *bp, void (*aiodone)(struct buf *)) +{ + /* reset b_iodone for when this is a single-buf i/o. */ + bp->b_iodone = aiodone; + + workqueue_enqueue(uvm.aiodone_queue, &bp->b_work, NULL); +} + +static void +lfs_cluster_callback(struct buf *bp) +{ + + lfs_generic_callback(bp, lfs_cluster_aiodone); +} + +void +lfs_supercallback(struct buf *bp) +{ + + lfs_generic_callback(bp, lfs_super_aiodone); +} + +/* + * The only buffers that are going to hit these functions are the + * segment write blocks, or the segment summaries, or the superblocks. + * + * All of the above are created by lfs_newbuf, and so do not need to be + * released via brelse. 
+ */ +void +lfs_callback(struct buf *bp) +{ + + lfs_generic_callback(bp, lfs_free_aiodone); +} + +/* + * Shellsort (diminishing increment sort) from Data Structures and + * Algorithms, Aho, Hopcroft and Ullman, 1983 Edition, page 290; + * see also Knuth Vol. 3, page 84. The increments are selected from + * formula (8), page 95. Roughly O(N^3/2). + */ +/* + * This is our own private copy of shellsort because we want to sort + * two parallel arrays (the array of buffer pointers and the array of + * logical block numbers) simultaneously. Note that we cast the array + * of logical block numbers to unsigned in this routine so that the + * negative block numbers (meta data blocks) sort AFTER the data blocks. + */ + +void +lfs_shellsort(struct buf **bp_array, int32_t *lb_array, int nmemb, int size) +{ + static int __rsshell_increments[] = { 4, 1, 0 }; + int incr, *incrp, t1, t2; + struct buf *bp_temp; + +#ifdef DEBUG + incr = 0; + for (t1 = 0; t1 < nmemb; t1++) { + for (t2 = 0; t2 * size < bp_array[t1]->b_bcount; t2++) { + if (lb_array[incr++] != bp_array[t1]->b_lblkno + t2) { + /* dump before panic */ + printf("lfs_shellsort: nmemb=%d, size=%d\n", + nmemb, size); + incr = 0; + for (t1 = 0; t1 < nmemb; t1++) { + const struct buf *bp = bp_array[t1]; + + printf("bp[%d]: lbn=%" PRIu64 ", size=%" + PRIu64 "\n", t1, + (uint64_t)bp->b_lblkno, + (uint64_t)bp->b_bcount); + printf("lbns:"); + for (t2 = 0; t2 * size < bp->b_bcount; + t2++) { + printf(" %" PRId32, + lb_array[incr++]); + } + printf("\n"); + } + panic("lfs_shellsort: inconsistent input"); + } + } + } +#endif + + for (incrp = __rsshell_increments; (incr = *incrp++) != 0;) + for (t1 = incr; t1 < nmemb; ++t1) + for (t2 = t1 - incr; t2 >= 0;) + if ((u_int32_t)bp_array[t2]->b_lblkno > + (u_int32_t)bp_array[t2 + incr]->b_lblkno) { + bp_temp = bp_array[t2]; + bp_array[t2] = bp_array[t2 + incr]; + bp_array[t2 + incr] = bp_temp; + t2 -= incr; + } else + break; + + /* Reform the list of logical blocks */ + incr = 0; + for (t1 = 0; t1 < nmemb; t1++) { + for (t2 = 0; t2 * size < bp_array[t1]->b_bcount; t2++) { + lb_array[incr++] = bp_array[t1]->b_lblkno + t2; + } + } +} + +/* + * Call vget with LK_NOWAIT. If we are the one who holds VI_XLOCK, + * however, we must press on. Just fake success in that case. + */ +int +lfs_vref(struct vnode *vp) +{ + int error; + struct lfs *fs; + + KASSERT(mutex_owned(vp->v_interlock)); + + fs = VTOI(vp)->i_lfs; + + ASSERT_MAYBE_SEGLOCK(fs); + + /* + * If we return 1 here during a flush, we risk vinvalbuf() not + * being able to flush all of the pages from this vnode, which + * will cause it to panic. So, return 0 if a flush is in progress. + */ + error = vget(vp, LK_NOWAIT); + if (error == EBUSY && IS_FLUSHING(VTOI(vp)->i_lfs, vp)) { + ++fs->lfs_flushvp_fakevref; + return 0; + } + return error; +} + +/* + * This is vrele except that we do not want to VOP_INACTIVE this vnode. We + * inline vrele here to avoid the vn_lock and VOP_INACTIVE call at the end. + */ +void +lfs_vunref(struct vnode *vp) +{ + struct lfs *fs; + + fs = VTOI(vp)->i_lfs; + ASSERT_MAYBE_SEGLOCK(fs); + + /* + * Analogous to lfs_vref, if the node is flushing, fake it. + */ + if (IS_FLUSHING(fs, vp) && fs->lfs_flushvp_fakevref) { + --fs->lfs_flushvp_fakevref; + return; + } + + /* does not call inactive */ + mutex_enter(vp->v_interlock); + vrelel(vp, 0); +} + +/* + * We use this when we have vnodes that were loaded in solely for cleaning.
+ * There is no reason to believe that these vnodes will be referenced again + * soon, since the cleaning process is unrelated to normal filesystem + * activity. Putting cleaned vnodes at the tail of the list has the effect + * of flushing the vnode LRU. So, put vnodes that were loaded only for + * cleaning at the head of the list, instead. + */ +void +lfs_vunref_head(struct vnode *vp) +{ + + ASSERT_SEGLOCK(VTOI(vp)->i_lfs); + + /* does not call inactive, inserts non-held vnode at head of freelist */ + mutex_enter(vp->v_interlock); + vrelel(vp, 0); +} + + +/* + * Set up an FINFO entry for a new file. The fip pointer is assumed to + * point at uninitialized space. + */ +void +lfs_acquire_finfo(struct lfs *fs, ino_t ino, int vers) +{ + struct segment *sp = fs->lfs_sp; + + KASSERT(vers > 0); + + if (sp->seg_bytes_left < fs->lfs_bsize || + sp->sum_bytes_left < sizeof(struct finfo)) + (void) lfs_writeseg(fs, fs->lfs_sp); + + sp->sum_bytes_left -= FINFOSIZE; + ++((SEGSUM *)(sp->segsum))->ss_nfinfo; + sp->fip->fi_nblocks = 0; + sp->fip->fi_ino = ino; + sp->fip->fi_version = vers; +} + +/* + * Release the FINFO entry, either clearing out an unused entry or + * advancing us to the next available entry. + */ +void +lfs_release_finfo(struct lfs *fs) +{ + struct segment *sp = fs->lfs_sp; + + if (sp->fip->fi_nblocks != 0) { + sp->fip = (FINFO*)((char *)sp->fip + FINFOSIZE + + sizeof(int32_t) * sp->fip->fi_nblocks); + sp->start_lbp = &sp->fip->fi_blocks[0]; + } else { + sp->sum_bytes_left += FINFOSIZE; + --((SEGSUM *)(sp->segsum))->ss_nfinfo; + } +} diff --git a/sys/ufs/lfs/lfs_subr.c b/sys/ufs/lfs/lfs_subr.c new file mode 100644 index 000000000..4da38aae3 --- /dev/null +++ b/sys/ufs/lfs/lfs_subr.c @@ -0,0 +1,661 @@ +/* $NetBSD: lfs_subr.c,v 1.76 2010/06/25 10:03:52 hannken Exp $ */ + +/*- + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Konrad E. Schroder . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_subr.c 8.4 (Berkeley) 5/8/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: lfs_subr.c,v 1.76 2010/06/25 10:03:52 hannken Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#ifdef DEBUG +const char *lfs_res_names[LFS_NB_COUNT] = { + "summary", + "superblock", + "file block", + "cluster", + "clean", + "blkiov", +}; +#endif + +int lfs_res_qty[LFS_NB_COUNT] = { + LFS_N_SUMMARIES, + LFS_N_SBLOCKS, + LFS_N_IBLOCKS, + LFS_N_CLUSTERS, + LFS_N_CLEAN, + LFS_N_BLKIOV, +}; + +void +lfs_setup_resblks(struct lfs *fs) +{ + int i, j; + int maxbpp; + + ASSERT_NO_SEGLOCK(fs); + fs->lfs_resblk = (res_t *)malloc(LFS_N_TOTAL * sizeof(res_t), M_SEGMENT, + M_WAITOK); + for (i = 0; i < LFS_N_TOTAL; i++) { + fs->lfs_resblk[i].inuse = 0; + fs->lfs_resblk[i].p = NULL; + } + for (i = 0; i < LFS_RESHASH_WIDTH; i++) + LIST_INIT(fs->lfs_reshash + i); + + /* + * These types of allocations can be larger than a page, + * so we can't use the pool subsystem for them. + */ + for (i = 0, j = 0; j < LFS_N_SUMMARIES; j++, i++) + fs->lfs_resblk[i].size = fs->lfs_sumsize; + for (j = 0; j < LFS_N_SBLOCKS; j++, i++) + fs->lfs_resblk[i].size = LFS_SBPAD; + for (j = 0; j < LFS_N_IBLOCKS; j++, i++) + fs->lfs_resblk[i].size = fs->lfs_bsize; + for (j = 0; j < LFS_N_CLUSTERS; j++, i++) + fs->lfs_resblk[i].size = MAXPHYS; + for (j = 0; j < LFS_N_CLEAN; j++, i++) + fs->lfs_resblk[i].size = MAXPHYS; + for (j = 0; j < LFS_N_BLKIOV; j++, i++) + fs->lfs_resblk[i].size = LFS_MARKV_MAXBLKCNT * sizeof(BLOCK_INFO); + + for (i = 0; i < LFS_N_TOTAL; i++) { + fs->lfs_resblk[i].p = malloc(fs->lfs_resblk[i].size, + M_SEGMENT, M_WAITOK); + } + + /* + * Initialize pools for small types (XXX is BPP small?) 
+ */ + pool_init(&fs->lfs_clpool, sizeof(struct lfs_cluster), 0, 0, 0, + "lfsclpl", &pool_allocator_nointr, IPL_NONE); + pool_init(&fs->lfs_segpool, sizeof(struct segment), 0, 0, 0, + "lfssegpool", &pool_allocator_nointr, IPL_NONE); + maxbpp = ((fs->lfs_sumsize - SEGSUM_SIZE(fs)) / sizeof(int32_t) + 2); + maxbpp = MIN(maxbpp, segsize(fs) / fs->lfs_fsize + 2); + pool_init(&fs->lfs_bpppool, maxbpp * sizeof(struct buf *), 0, 0, 0, + "lfsbpppl", &pool_allocator_nointr, IPL_NONE); +} + +void +lfs_free_resblks(struct lfs *fs) +{ + int i; + + pool_destroy(&fs->lfs_bpppool); + pool_destroy(&fs->lfs_segpool); + pool_destroy(&fs->lfs_clpool); + + mutex_enter(&lfs_lock); + for (i = 0; i < LFS_N_TOTAL; i++) { + while (fs->lfs_resblk[i].inuse) + mtsleep(&fs->lfs_resblk, PRIBIO + 1, "lfs_free", 0, + &lfs_lock); + if (fs->lfs_resblk[i].p != NULL) + free(fs->lfs_resblk[i].p, M_SEGMENT); + } + free(fs->lfs_resblk, M_SEGMENT); + mutex_exit(&lfs_lock); +} + +static unsigned int +lfs_mhash(void *vp) +{ + return (unsigned int)(((unsigned long)vp) >> 2) % LFS_RESHASH_WIDTH; +} + +/* + * Return memory of the given size for the given purpose, or use one of a + * number of spare last-resort buffers, if malloc returns NULL. + */ +void * +lfs_malloc(struct lfs *fs, size_t size, int type) +{ + struct lfs_res_blk *re; + void *r; + int i, s, start; + unsigned int h; + + ASSERT_MAYBE_SEGLOCK(fs); + r = NULL; + + /* If no mem allocated for this type, it just waits */ + if (lfs_res_qty[type] == 0) { + r = malloc(size, M_SEGMENT, M_WAITOK); + return r; + } + + /* Otherwise try a quick malloc, and if it works, great */ + if ((r = malloc(size, M_SEGMENT, M_NOWAIT)) != NULL) { + return r; + } + + /* + * If malloc returned NULL, we are forced to use one of our + * reserve blocks. We have on hand at least one summary block, + * at least one cluster block, at least one superblock, + * and several indirect blocks. + */ + + mutex_enter(&lfs_lock); + /* skip over blocks of other types */ + for (i = 0, start = 0; i < type; i++) + start += lfs_res_qty[i]; + while (r == NULL) { + for (i = 0; i < lfs_res_qty[type]; i++) { + if (fs->lfs_resblk[start + i].inuse == 0) { + re = fs->lfs_resblk + start + i; + re->inuse = 1; + r = re->p; + KASSERT(re->size >= size); + h = lfs_mhash(r); + s = splbio(); + LIST_INSERT_HEAD(&fs->lfs_reshash[h], re, res); + splx(s); + mutex_exit(&lfs_lock); + return r; + } + } + DLOG((DLOG_MALLOC, "sleeping on %s (%d)\n", + lfs_res_names[type], lfs_res_qty[type])); + mtsleep(&fs->lfs_resblk, PVM, "lfs_malloc", 0, + &lfs_lock); + DLOG((DLOG_MALLOC, "done sleeping on %s\n", + lfs_res_names[type])); + } + /* NOTREACHED */ + mutex_exit(&lfs_lock); + return r; +} + +void +lfs_free(struct lfs *fs, void *p, int type) +{ + int s; + unsigned int h; + res_t *re; +#ifdef DEBUG + int i; +#endif + + ASSERT_MAYBE_SEGLOCK(fs); + h = lfs_mhash(p); + mutex_enter(&lfs_lock); + s = splbio(); + LIST_FOREACH(re, &fs->lfs_reshash[h], res) { + if (re->p == p) { + KASSERT(re->inuse == 1); + LIST_REMOVE(re, res); + re->inuse = 0; + wakeup(&fs->lfs_resblk); + splx(s); + mutex_exit(&lfs_lock); + return; + } + } +#ifdef DEBUG + for (i = 0; i < LFS_N_TOTAL; i++) { + if (fs->lfs_resblk[i].p == p) + panic("lfs_free: inconsistent reserved block"); + } +#endif + splx(s); + mutex_exit(&lfs_lock); + + /* + * If we didn't find it, free it. + */ + free(p, M_SEGMENT); +} + +/* + * lfs_seglock -- + * Single thread the segment writer. 
+ */ +int +lfs_seglock(struct lfs *fs, unsigned long flags) +{ + struct segment *sp; + + mutex_enter(&lfs_lock); + if (fs->lfs_seglock) { + if (fs->lfs_lockpid == curproc->p_pid && + fs->lfs_locklwp == curlwp->l_lid) { + ++fs->lfs_seglock; + fs->lfs_sp->seg_flags |= flags; + mutex_exit(&lfs_lock); + return 0; + } else if (flags & SEGM_PAGEDAEMON) { + mutex_exit(&lfs_lock); + return EWOULDBLOCK; + } else { + while (fs->lfs_seglock) { + (void)mtsleep(&fs->lfs_seglock, PRIBIO + 1, + "lfs_seglock", 0, &lfs_lock); + } + } + } + + fs->lfs_seglock = 1; + fs->lfs_lockpid = curproc->p_pid; + fs->lfs_locklwp = curlwp->l_lid; + mutex_exit(&lfs_lock); + fs->lfs_cleanind = 0; + +#ifdef DEBUG + LFS_ENTER_LOG("seglock", __FILE__, __LINE__, 0, flags, curproc->p_pid); +#endif + /* Drain fragment size changes out */ + rw_enter(&fs->lfs_fraglock, RW_WRITER); + + sp = fs->lfs_sp = pool_get(&fs->lfs_segpool, PR_WAITOK); + sp->bpp = pool_get(&fs->lfs_bpppool, PR_WAITOK); + sp->seg_flags = flags; + sp->vp = NULL; + sp->seg_iocount = 0; + (void) lfs_initseg(fs); + + /* + * Keep a cumulative count of the outstanding I/O operations. If the + * disk drive catches up with us it could go to zero before we finish, + * so we artificially increment it by one until we've scheduled all of + * the writes we intend to do. + */ + mutex_enter(&lfs_lock); + ++fs->lfs_iocount; + mutex_exit(&lfs_lock); + return 0; +} + +static void lfs_unmark_dirop(struct lfs *); + +static void +lfs_unmark_dirop(struct lfs *fs) +{ + struct inode *ip, *nip; + struct vnode *vp; + int doit; + + ASSERT_NO_SEGLOCK(fs); + mutex_enter(&lfs_lock); + doit = !(fs->lfs_flags & LFS_UNDIROP); + if (doit) + fs->lfs_flags |= LFS_UNDIROP; + if (!doit) { + mutex_exit(&lfs_lock); + return; + } + + for (ip = TAILQ_FIRST(&fs->lfs_dchainhd); ip != NULL; ip = nip) { + nip = TAILQ_NEXT(ip, i_lfs_dchain); + vp = ITOV(ip); + if ((VTOI(vp)->i_flag & (IN_ADIROP | IN_ALLMOD)) == 0) { + --lfs_dirvcount; + --fs->lfs_dirvcount; + vp->v_uflag &= ~VU_DIROP; + TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain); + wakeup(&lfs_dirvcount); + fs->lfs_unlockvp = vp; + mutex_exit(&lfs_lock); + vrele(vp); + mutex_enter(&lfs_lock); + fs->lfs_unlockvp = NULL; + } + } + + fs->lfs_flags &= ~LFS_UNDIROP; + wakeup(&fs->lfs_flags); + mutex_exit(&lfs_lock); +} + +static void +lfs_auto_segclean(struct lfs *fs) +{ + int i, error, s, waited; + + ASSERT_SEGLOCK(fs); + /* + * Now that we've swapped lfs_activesb, but while we still + * hold the segment lock, run through the segment list marking + * the empty ones clean. + * XXX - do we really need to do them all at once? + */ + waited = 0; + for (i = 0; i < fs->lfs_nseg; i++) { + if ((fs->lfs_suflags[0][i] & + (SEGUSE_ACTIVE | SEGUSE_DIRTY | SEGUSE_EMPTY)) == + (SEGUSE_DIRTY | SEGUSE_EMPTY) && + (fs->lfs_suflags[1][i] & + (SEGUSE_ACTIVE | SEGUSE_DIRTY | SEGUSE_EMPTY)) == + (SEGUSE_DIRTY | SEGUSE_EMPTY)) { + + /* Make sure the sb is written before we clean */ + mutex_enter(&lfs_lock); + s = splbio(); + while (waited == 0 && fs->lfs_sbactive) + mtsleep(&fs->lfs_sbactive, PRIBIO+1, "lfs asb", + 0, &lfs_lock); + splx(s); + mutex_exit(&lfs_lock); + waited = 1; + + if ((error = lfs_do_segclean(fs, i)) != 0) { + DLOG((DLOG_CLEAN, "lfs_auto_segclean: lfs_do_segclean returned %d for seg %d\n", error, i)); + } + } + fs->lfs_suflags[1 - fs->lfs_activesb][i] = + fs->lfs_suflags[fs->lfs_activesb][i]; + } +} + +/* + * lfs_segunlock -- + * Single thread the segment writer. 
+ */ +void +lfs_segunlock(struct lfs *fs) +{ + struct segment *sp; + unsigned long sync, ckp; + struct buf *bp; + int do_unmark_dirop = 0; + + sp = fs->lfs_sp; + + mutex_enter(&lfs_lock); + KASSERT(LFS_SEGLOCK_HELD(fs)); + if (fs->lfs_seglock == 1) { + if ((sp->seg_flags & (SEGM_PROT | SEGM_CLEAN)) == 0 && + LFS_STARVED_FOR_SEGS(fs) == 0) + do_unmark_dirop = 1; + mutex_exit(&lfs_lock); + sync = sp->seg_flags & SEGM_SYNC; + ckp = sp->seg_flags & SEGM_CKP; + + /* We should have a segment summary, and nothing else */ + KASSERT(sp->cbpp == sp->bpp + 1); + + /* Free allocated segment summary */ + fs->lfs_offset -= btofsb(fs, fs->lfs_sumsize); + bp = *sp->bpp; + lfs_freebuf(fs, bp); + + pool_put(&fs->lfs_bpppool, sp->bpp); + sp->bpp = NULL; + + /* + * If we're not sync, we're done with sp, get rid of it. + * Otherwise, we keep a local copy around but free + * fs->lfs_sp so another process can use it (we have to + * wait but they don't have to wait for us). + */ + if (!sync) + pool_put(&fs->lfs_segpool, sp); + fs->lfs_sp = NULL; + + /* + * If the I/O count is non-zero, sleep until it reaches zero. + * At the moment, the user's process hangs around so we can + * sleep. + */ + mutex_enter(&lfs_lock); + if (--fs->lfs_iocount == 0) { + LFS_DEBUG_COUNTLOCKED("lfs_segunlock"); + } + if (fs->lfs_iocount <= 1) + wakeup(&fs->lfs_iocount); + mutex_exit(&lfs_lock); + /* + * If we're not checkpointing, we don't have to block + * other processes to wait for a synchronous write + * to complete. + */ + if (!ckp) { +#ifdef DEBUG + LFS_ENTER_LOG("segunlock_std", __FILE__, __LINE__, 0, 0, curproc->p_pid); +#endif + mutex_enter(&lfs_lock); + --fs->lfs_seglock; + fs->lfs_lockpid = 0; + fs->lfs_locklwp = 0; + mutex_exit(&lfs_lock); + wakeup(&fs->lfs_seglock); + } + /* + * We let checkpoints happen asynchronously. That means + * that during recovery, we have to roll forward between + * the two segments described by the first and second + * superblocks to make sure that the checkpoint described + * by a superblock completed. 
+ */ + mutex_enter(&lfs_lock); + while (ckp && sync && fs->lfs_iocount) { + (void)mtsleep(&fs->lfs_iocount, PRIBIO + 1, + "lfs_iocount", 0, &lfs_lock); + DLOG((DLOG_SEG, "sleeping on iocount %x == %d\n", fs, fs->lfs_iocount)); + } + while (sync && sp->seg_iocount) { + (void)mtsleep(&sp->seg_iocount, PRIBIO + 1, + "seg_iocount", 0, &lfs_lock); + DLOG((DLOG_SEG, "sleeping on iocount %x == %d\n", sp, sp->seg_iocount)); + } + mutex_exit(&lfs_lock); + if (sync) + pool_put(&fs->lfs_segpool, sp); + + if (ckp) { + fs->lfs_nactive = 0; + /* If we *know* everything's on disk, write both sbs */ + /* XXX should wait for this one */ + if (sync) + lfs_writesuper(fs, fs->lfs_sboffs[fs->lfs_activesb]); + lfs_writesuper(fs, fs->lfs_sboffs[1 - fs->lfs_activesb]); + if (!(fs->lfs_ivnode->v_mount->mnt_iflag & IMNT_UNMOUNT)) { + lfs_auto_segclean(fs); + /* If sync, we can clean the remainder too */ + if (sync) + lfs_auto_segclean(fs); + } + fs->lfs_activesb = 1 - fs->lfs_activesb; +#ifdef DEBUG + LFS_ENTER_LOG("segunlock_ckp", __FILE__, __LINE__, 0, 0, curproc->p_pid); +#endif + mutex_enter(&lfs_lock); + --fs->lfs_seglock; + fs->lfs_lockpid = 0; + fs->lfs_locklwp = 0; + mutex_exit(&lfs_lock); + wakeup(&fs->lfs_seglock); + } + /* Reenable fragment size changes */ + rw_exit(&fs->lfs_fraglock); + if (do_unmark_dirop) + lfs_unmark_dirop(fs); + } else if (fs->lfs_seglock == 0) { + mutex_exit(&lfs_lock); + panic ("Seglock not held"); + } else { + --fs->lfs_seglock; + mutex_exit(&lfs_lock); + } +} + +/* + * Drain dirops and start writer. + * + * No simple_locks are held when we enter and none are held when we return. + */ +int +lfs_writer_enter(struct lfs *fs, const char *wmesg) +{ + int error = 0; + + ASSERT_MAYBE_SEGLOCK(fs); + mutex_enter(&lfs_lock); + + /* disallow dirops during flush */ + fs->lfs_writer++; + + while (fs->lfs_dirops > 0) { + ++fs->lfs_diropwait; + error = mtsleep(&fs->lfs_writer, PRIBIO+1, wmesg, 0, + &lfs_lock); + --fs->lfs_diropwait; + } + + if (error) + fs->lfs_writer--; + + mutex_exit(&lfs_lock); + + return error; +} + +void +lfs_writer_leave(struct lfs *fs) +{ + bool dowakeup; + + ASSERT_MAYBE_SEGLOCK(fs); + mutex_enter(&lfs_lock); + dowakeup = !(--fs->lfs_writer); + mutex_exit(&lfs_lock); + if (dowakeup) + wakeup(&fs->lfs_dirops); +} + +/* + * Unlock, wait for the cleaner, then relock to where we were before. + * To be used only at a fairly high level, to address a paucity of free + * segments propagated back from lfs_gop_write(). + */ +void +lfs_segunlock_relock(struct lfs *fs) +{ + int n = fs->lfs_seglock; + u_int16_t seg_flags; + CLEANERINFO *cip; + struct buf *bp; + + if (n == 0) + return; + + /* Write anything we've already gathered to disk */ + lfs_writeseg(fs, fs->lfs_sp); + + /* Tell cleaner */ + LFS_CLEANERINFO(cip, fs, bp); + cip->flags |= LFS_CLEANER_MUST_CLEAN; + LFS_SYNC_CLEANERINFO(cip, fs, bp, 1); + + /* Save segment flags for later */ + seg_flags = fs->lfs_sp->seg_flags; + + fs->lfs_sp->seg_flags |= SEGM_PROT; /* Don't unmark dirop nodes */ + while(fs->lfs_seglock) + lfs_segunlock(fs); + + /* Wait for the cleaner */ + lfs_wakeup_cleaner(fs); + mutex_enter(&lfs_lock); + while (LFS_STARVED_FOR_SEGS(fs)) + mtsleep(&fs->lfs_avail, PRIBIO, "relock", 0, + &lfs_lock); + mutex_exit(&lfs_lock); + + /* Put the segment lock back the way it was. 
*/ + while(n--) + lfs_seglock(fs, seg_flags); + + /* Cleaner can relax now */ + LFS_CLEANERINFO(cip, fs, bp); + cip->flags &= ~LFS_CLEANER_MUST_CLEAN; + LFS_SYNC_CLEANERINFO(cip, fs, bp, 1); + + return; +} + +/* + * Wake up the cleaner, provided that nowrap is not set. + */ +void +lfs_wakeup_cleaner(struct lfs *fs) +{ + if (fs->lfs_nowrap > 0) + return; + + wakeup(&fs->lfs_nextseg); + wakeup(&lfs_allclean_wakeup); +} diff --git a/sys/ufs/lfs/lfs_syscalls.c b/sys/ufs/lfs/lfs_syscalls.c new file mode 100644 index 000000000..442b81d46 --- /dev/null +++ b/sys/ufs/lfs/lfs_syscalls.c @@ -0,0 +1,1224 @@ +/* $NetBSD: lfs_syscalls.c,v 1.139 2011/06/12 03:36:01 rmind Exp $ */ + +/*- + * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2007, 2008 + * The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Konrad E. Schroder . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/*- + * Copyright (c) 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_syscalls.c 8.10 (Berkeley) 5/14/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.139 2011/06/12 03:36:01 rmind Exp $"); + +#ifndef LFS +# define LFS /* for prototypes in syscallargs.h */ +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +struct buf *lfs_fakebuf(struct lfs *, struct vnode *, int, size_t, void *); +int lfs_fasthashget(dev_t, ino_t, struct vnode **); + +pid_t lfs_cleaner_pid = 0; + +/* + * sys_lfs_markv: + * + * This will mark inodes and blocks dirty, so they are written into the log. + * It will block until all the blocks have been written. The segment create + * time passed in the block_info and inode_info structures is used to decide + * if the data is valid for each block (in case some process dirtied a block + * or inode that is being cleaned between the determination that a block is + * live and the lfs_markv call). + * + * 0 on success + * -1/errno is return on error. + */ +#ifdef USE_64BIT_SYSCALLS +int +sys_lfs_markv(struct lwp *l, const struct sys_lfs_markv_args *uap, register_t *retval) +{ + /* { + syscallarg(fsid_t *) fsidp; + syscallarg(struct block_info *) blkiov; + syscallarg(int) blkcnt; + } */ + BLOCK_INFO *blkiov; + int blkcnt, error; + fsid_t fsid; + struct lfs *fs; + struct mount *mntp; + + if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, + NULL)) != 0) + return (error); + + if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) + return (error); + + if ((mntp = vfs_getvfs(fsidp)) == NULL) + return (ENOENT); + fs = VFSTOUFS(mntp)->um_lfs; + + blkcnt = SCARG(uap, blkcnt); + if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT) + return (EINVAL); + + KERNEL_LOCK(1, NULL); + blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV); + if ((error = copyin(SCARG(uap, blkiov), blkiov, + blkcnt * sizeof(BLOCK_INFO))) != 0) + goto out; + + if ((error = lfs_markv(p, &fsid, blkiov, blkcnt)) == 0) + copyout(blkiov, SCARG(uap, blkiov), + blkcnt * sizeof(BLOCK_INFO)); + out: + lfs_free(fs, blkiov, LFS_NB_BLKIOV); + KERNEL_UNLOCK_ONE(NULL); + return error; +} +#else +int +sys_lfs_markv(struct lwp *l, const struct sys_lfs_markv_args *uap, register_t *retval) +{ + /* { + syscallarg(fsid_t *) fsidp; + syscallarg(struct block_info *) blkiov; + syscallarg(int) blkcnt; + } */ + BLOCK_INFO *blkiov; + BLOCK_INFO_15 *blkiov15; + int i, blkcnt, error; + fsid_t fsid; + struct lfs *fs; + struct mount *mntp; + + if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, + NULL)) != 0) + return (error); + + if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) + return (error); + + if ((mntp = vfs_getvfs(&fsid)) == NULL) + return (ENOENT); + fs = VFSTOUFS(mntp)->um_lfs; + + blkcnt = SCARG(uap, blkcnt); + if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT) + return (EINVAL); + + KERNEL_LOCK(1, NULL); + blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV); + 
blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV); + if ((error = copyin(SCARG(uap, blkiov), blkiov15, + blkcnt * sizeof(BLOCK_INFO_15))) != 0) + goto out; + + for (i = 0; i < blkcnt; i++) { + blkiov[i].bi_inode = blkiov15[i].bi_inode; + blkiov[i].bi_lbn = blkiov15[i].bi_lbn; + blkiov[i].bi_daddr = blkiov15[i].bi_daddr; + blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate; + blkiov[i].bi_version = blkiov15[i].bi_version; + blkiov[i].bi_bp = blkiov15[i].bi_bp; + blkiov[i].bi_size = blkiov15[i].bi_size; + } + + if ((error = lfs_markv(l->l_proc, &fsid, blkiov, blkcnt)) == 0) { + for (i = 0; i < blkcnt; i++) { + blkiov15[i].bi_inode = blkiov[i].bi_inode; + blkiov15[i].bi_lbn = blkiov[i].bi_lbn; + blkiov15[i].bi_daddr = blkiov[i].bi_daddr; + blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate; + blkiov15[i].bi_version = blkiov[i].bi_version; + blkiov15[i].bi_bp = blkiov[i].bi_bp; + blkiov15[i].bi_size = blkiov[i].bi_size; + } + copyout(blkiov15, SCARG(uap, blkiov), + blkcnt * sizeof(BLOCK_INFO_15)); + } + out: + lfs_free(fs, blkiov, LFS_NB_BLKIOV); + lfs_free(fs, blkiov15, LFS_NB_BLKIOV); + KERNEL_UNLOCK_ONE(NULL); + return error; +} +#endif + +#define LFS_MARKV_MAX_BLOCKS (LFS_MAX_BUFS) + +int +lfs_markv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, + int blkcnt) +{ + BLOCK_INFO *blkp; + IFILE *ifp; + struct buf *bp; + struct inode *ip = NULL; + struct lfs *fs; + struct mount *mntp; + struct vnode *vp = NULL; + ino_t lastino; + daddr_t b_daddr, v_daddr; + int cnt, error; + int do_again = 0; + int numrefed = 0; + ino_t maxino; + size_t obsize; + + /* number of blocks/inodes that we have already bwrite'ed */ + int nblkwritten, ninowritten; + + if ((mntp = vfs_getvfs(fsidp)) == NULL) + return (ENOENT); + + fs = VFSTOUFS(mntp)->um_lfs; + + if (fs->lfs_ronly) + return EROFS; + + maxino = (fragstoblks(fs, VTOI(fs->lfs_ivnode)->i_ffs1_blocks) - + fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb; + + cnt = blkcnt; + + if ((error = vfs_busy(mntp, NULL)) != 0) + return (error); + + /* + * This seglock is just to prevent the fact that we might have to sleep + * from allowing the possibility that our blocks might become + * invalid. + * + * It is also important to note here that unless we specify SEGM_CKP, + * any Ifile blocks that we might be asked to clean will never get + * to the disk. + */ + lfs_seglock(fs, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC); + + /* Mark blocks/inodes dirty. */ + error = 0; + + /* these were inside the initialization for the for loop */ + v_daddr = LFS_UNUSED_DADDR; + lastino = LFS_UNUSED_INUM; + nblkwritten = ninowritten = 0; + for (blkp = blkiov; cnt--; ++blkp) + { + /* Bounds-check incoming data, avoid panic for failed VGET */ + if (blkp->bi_inode <= 0 || blkp->bi_inode >= maxino) { + error = EINVAL; + goto err3; + } + /* + * Get the IFILE entry (only once) and see if the file still + * exists. + */ + if (lastino != blkp->bi_inode) { + /* + * Finish the old file, if there was one. The presence + * of a usable vnode in vp is signaled by a valid v_daddr. + */ + if (v_daddr != LFS_UNUSED_DADDR) { + lfs_vunref(vp); + numrefed--; + } + + /* + * Start a new file + */ + lastino = blkp->bi_inode; + if (blkp->bi_inode == LFS_IFILE_INUM) + v_daddr = fs->lfs_idaddr; + else { + LFS_IENTRY(ifp, fs, blkp->bi_inode, bp); + /* XXX fix for force write */ + v_daddr = ifp->if_daddr; + brelse(bp, 0); + } + if (v_daddr == LFS_UNUSED_DADDR) + continue; + + /* Get the vnode/inode. 
*/ + error = lfs_fastvget(mntp, blkp->bi_inode, v_daddr, + &vp, + (blkp->bi_lbn == LFS_UNUSED_LBN + ? blkp->bi_bp + : NULL)); + + if (!error) { + numrefed++; + } + if (error) { + DLOG((DLOG_CLEAN, "lfs_markv: lfs_fastvget" + " failed with %d (ino %d, segment %d)\n", + error, blkp->bi_inode, + dtosn(fs, blkp->bi_daddr))); + /* + * If we got EAGAIN, that means that the + * Inode was locked. This is + * recoverable: just clean the rest of + * this segment, and let the cleaner try + * again with another. (When the + * cleaner runs again, this segment will + * sort high on the list, since it is + * now almost entirely empty.) But, we + * still set v_daddr = LFS_UNUSED_ADDR + * so as not to test this over and over + * again. + */ + if (error == EAGAIN) { + error = 0; + do_again++; + } +#ifdef DIAGNOSTIC + else if (error != ENOENT) + panic("lfs_markv VFS_VGET FAILED"); +#endif + /* lastino = LFS_UNUSED_INUM; */ + v_daddr = LFS_UNUSED_DADDR; + vp = NULL; + ip = NULL; + continue; + } + ip = VTOI(vp); + ninowritten++; + } else if (v_daddr == LFS_UNUSED_DADDR) { + /* + * This can only happen if the vnode is dead (or + * in any case we can't get it...e.g., it is + * inlocked). Keep going. + */ + continue; + } + + /* Past this point we are guaranteed that vp, ip are valid. */ + + /* Can't clean VU_DIROP directories in case of truncation */ + /* XXX - maybe we should mark removed dirs specially? */ + if (vp->v_type == VDIR && (vp->v_uflag & VU_DIROP)) { + do_again++; + continue; + } + + /* If this BLOCK_INFO didn't contain a block, keep going. */ + if (blkp->bi_lbn == LFS_UNUSED_LBN) { + /* XXX need to make sure that the inode gets written in this case */ + /* XXX but only write the inode if it's the right one */ + if (blkp->bi_inode != LFS_IFILE_INUM) { + LFS_IENTRY(ifp, fs, blkp->bi_inode, bp); + if (ifp->if_daddr == blkp->bi_daddr) { + mutex_enter(&lfs_lock); + LFS_SET_UINO(ip, IN_CLEANING); + mutex_exit(&lfs_lock); + } + brelse(bp, 0); + } + continue; + } + + b_daddr = 0; + if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &b_daddr, NULL) || + dbtofsb(fs, b_daddr) != blkp->bi_daddr) + { + if (dtosn(fs, dbtofsb(fs, b_daddr)) == + dtosn(fs, blkp->bi_daddr)) + { + DLOG((DLOG_CLEAN, "lfs_markv: wrong da same seg: %llx vs %llx\n", + (long long)blkp->bi_daddr, (long long)dbtofsb(fs, b_daddr))); + } + do_again++; + continue; + } + + /* + * Check block sizes. The blocks being cleaned come from + * disk, so they should have the same size as their on-disk + * counterparts. + */ + if (blkp->bi_lbn >= 0) + obsize = blksize(fs, ip, blkp->bi_lbn); + else + obsize = fs->lfs_bsize; + /* Check for fragment size change */ + if (blkp->bi_lbn >= 0 && blkp->bi_lbn < NDADDR) { + obsize = ip->i_lfs_fragsize[blkp->bi_lbn]; + } + if (obsize != blkp->bi_size) { + DLOG((DLOG_CLEAN, "lfs_markv: ino %d lbn %lld wrong" + " size (%ld != %d), try again\n", + blkp->bi_inode, (long long)blkp->bi_lbn, + (long) obsize, blkp->bi_size)); + do_again++; + continue; + } + + /* + * If we get to here, then we are keeping the block. If + * it is an indirect block, we want to actually put it + * in the buffer cache so that it can be updated in the + * finish_meta section. If it's not, we need to + * allocate a fake buffer so that writeseg can perform + * the copyin and write the buffer. 
+ */ + if (ip->i_number != LFS_IFILE_INUM && blkp->bi_lbn >= 0) { + /* Data Block */ + bp = lfs_fakebuf(fs, vp, blkp->bi_lbn, + blkp->bi_size, blkp->bi_bp); + /* Pretend we used bread() to get it */ + bp->b_blkno = fsbtodb(fs, blkp->bi_daddr); + } else { + /* Indirect block or ifile */ + if (blkp->bi_size != fs->lfs_bsize && + ip->i_number != LFS_IFILE_INUM) + panic("lfs_markv: partial indirect block?" + " size=%d\n", blkp->bi_size); + bp = getblk(vp, blkp->bi_lbn, blkp->bi_size, 0, 0); + if (!(bp->b_oflags & (BO_DONE|BO_DELWRI))) { + /* + * The block in question was not found + * in the cache; i.e., the block that + * getblk() returned is empty. So, we + * can (and should) copy in the + * contents, because we've already + * determined that this was the right + * version of this block on disk. + * + * And, it can't have changed underneath + * us, because we have the segment lock. + */ + error = copyin(blkp->bi_bp, bp->b_data, blkp->bi_size); + if (error) + goto err2; + } + } + if ((error = lfs_bwrite_ext(bp, BW_CLEAN)) != 0) + goto err2; + + nblkwritten++; + /* + * XXX should account indirect blocks and ifile pages as well + */ + if (nblkwritten + lblkno(fs, ninowritten * sizeof (struct ufs1_dinode)) + > LFS_MARKV_MAX_BLOCKS) { + DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos\n", + nblkwritten, ninowritten)); + lfs_segwrite(mntp, SEGM_CLEAN); + nblkwritten = ninowritten = 0; + } + } + + /* + * Finish the old file, if there was one + */ + if (v_daddr != LFS_UNUSED_DADDR) { + lfs_vunref(vp); + numrefed--; + } + +#ifdef DIAGNOSTIC + if (numrefed != 0) + panic("lfs_markv: numrefed=%d", numrefed); +#endif + DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos (check point)\n", + nblkwritten, ninowritten)); + + /* + * The last write has to be SEGM_SYNC, because of calling semantics. + * It also has to be SEGM_CKP, because otherwise we could write + * over the newly cleaned data contained in a checkpoint, and then + * we'd be unhappy at recovery time. + */ + lfs_segwrite(mntp, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC); + + lfs_segunlock(fs); + + vfs_unbusy(mntp, false, NULL); + if (error) + return (error); + else if (do_again) + return EAGAIN; + + return 0; + +err2: + DLOG((DLOG_CLEAN, "lfs_markv err2\n")); + + /* + * XXX we're here because copyin() failed. + * XXX it means that we can't trust the cleanerd. too bad. + * XXX how can we recover from this? + */ + +err3: + KERNEL_UNLOCK_ONE(NULL); + /* + * XXX should do segwrite here anyway? + */ + + if (v_daddr != LFS_UNUSED_DADDR) { + lfs_vunref(vp); + --numrefed; + } + + lfs_segunlock(fs); + vfs_unbusy(mntp, false, NULL); +#ifdef DIAGNOSTIC + if (numrefed != 0) + panic("lfs_markv: numrefed=%d", numrefed); +#endif + + return (error); +} + +/* + * sys_lfs_bmapv: + * + * This will fill in the current disk address for arrays of blocks. + * + * 0 on success + * -1/errno is return on error. 
+ */ +#ifdef USE_64BIT_SYSCALLS +int +sys_lfs_bmapv(struct lwp *l, const struct sys_lfs_bmapv_args *uap, register_t *retval) +{ + /* { + syscallarg(fsid_t *) fsidp; + syscallarg(struct block_info *) blkiov; + syscallarg(int) blkcnt; + } */ + BLOCK_INFO *blkiov; + int blkcnt, error; + fsid_t fsid; + struct lfs *fs; + struct mount *mntp; + + if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, + NULL)) != 0) + return (error); + + if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) + return (error); + + if ((mntp = vfs_getvfs(&fsid)) == NULL) + return (ENOENT); + fs = VFSTOUFS(mntp)->um_lfs; + + blkcnt = SCARG(uap, blkcnt); + if ((u_int) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO)) + return (EINVAL); + KERNEL_LOCK(1, NULL); + blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV); + if ((error = copyin(SCARG(uap, blkiov), blkiov, + blkcnt * sizeof(BLOCK_INFO))) != 0) + goto out; + + if ((error = lfs_bmapv(p, &fsid, blkiov, blkcnt)) == 0) + copyout(blkiov, SCARG(uap, blkiov), + blkcnt * sizeof(BLOCK_INFO)); + out: + lfs_free(fs, blkiov, LFS_NB_BLKIOV); + KERNEL_UNLOCK_ONE(NULL); + return error; +} +#else +int +sys_lfs_bmapv(struct lwp *l, const struct sys_lfs_bmapv_args *uap, register_t *retval) +{ + /* { + syscallarg(fsid_t *) fsidp; + syscallarg(struct block_info *) blkiov; + syscallarg(int) blkcnt; + } */ + BLOCK_INFO *blkiov; + BLOCK_INFO_15 *blkiov15; + int i, blkcnt, error; + fsid_t fsid; + struct lfs *fs; + struct mount *mntp; + + if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, + NULL)) != 0) + return (error); + + if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) + return (error); + + if ((mntp = vfs_getvfs(&fsid)) == NULL) + return (ENOENT); + fs = VFSTOUFS(mntp)->um_lfs; + + blkcnt = SCARG(uap, blkcnt); + if ((size_t) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO)) + return (EINVAL); + KERNEL_LOCK(1, NULL); + blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV); + blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV); + if ((error = copyin(SCARG(uap, blkiov), blkiov15, + blkcnt * sizeof(BLOCK_INFO_15))) != 0) + goto out; + + for (i = 0; i < blkcnt; i++) { + blkiov[i].bi_inode = blkiov15[i].bi_inode; + blkiov[i].bi_lbn = blkiov15[i].bi_lbn; + blkiov[i].bi_daddr = blkiov15[i].bi_daddr; + blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate; + blkiov[i].bi_version = blkiov15[i].bi_version; + blkiov[i].bi_bp = blkiov15[i].bi_bp; + blkiov[i].bi_size = blkiov15[i].bi_size; + } + + if ((error = lfs_bmapv(l->l_proc, &fsid, blkiov, blkcnt)) == 0) { + for (i = 0; i < blkcnt; i++) { + blkiov15[i].bi_inode = blkiov[i].bi_inode; + blkiov15[i].bi_lbn = blkiov[i].bi_lbn; + blkiov15[i].bi_daddr = blkiov[i].bi_daddr; + blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate; + blkiov15[i].bi_version = blkiov[i].bi_version; + blkiov15[i].bi_bp = blkiov[i].bi_bp; + blkiov15[i].bi_size = blkiov[i].bi_size; + } + copyout(blkiov15, SCARG(uap, blkiov), + blkcnt * sizeof(BLOCK_INFO_15)); + } + out: + lfs_free(fs, blkiov, LFS_NB_BLKIOV); + lfs_free(fs, blkiov15, LFS_NB_BLKIOV); + KERNEL_UNLOCK_ONE(NULL); + return error; +} +#endif + +int +lfs_bmapv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt) +{ + BLOCK_INFO *blkp; + IFILE *ifp; + struct buf *bp; + struct inode *ip = NULL; + struct lfs *fs; + struct mount *mntp; + struct ufsmount *ump; + struct vnode *vp; + ino_t lastino; + daddr_t v_daddr; + int cnt, error; + int numrefed = 0; + + lfs_cleaner_pid = p->p_pid; + + if ((mntp = 
vfs_getvfs(fsidp)) == NULL) + return (ENOENT); + + ump = VFSTOUFS(mntp); + if ((error = vfs_busy(mntp, NULL)) != 0) + return (error); + + cnt = blkcnt; + + fs = VFSTOUFS(mntp)->um_lfs; + + error = 0; + + /* these were inside the initialization for the for loop */ + v_daddr = LFS_UNUSED_DADDR; + lastino = LFS_UNUSED_INUM; + for (blkp = blkiov; cnt--; ++blkp) + { + /* + * Get the IFILE entry (only once) and see if the file still + * exists. + */ + if (lastino != blkp->bi_inode) { + /* + * Finish the old file, if there was one. The presence + * of a usable vnode in vp is signaled by a valid + * v_daddr. + */ + if (v_daddr != LFS_UNUSED_DADDR) { + lfs_vunref(vp); + numrefed--; + } + + /* + * Start a new file + */ + lastino = blkp->bi_inode; + if (blkp->bi_inode == LFS_IFILE_INUM) + v_daddr = fs->lfs_idaddr; + else { + LFS_IENTRY(ifp, fs, blkp->bi_inode, bp); + v_daddr = ifp->if_daddr; + brelse(bp, 0); + } + if (v_daddr == LFS_UNUSED_DADDR) { + blkp->bi_daddr = LFS_UNUSED_DADDR; + continue; + } + /* + * A regular call to VFS_VGET could deadlock + * here. Instead, we try an unlocked access. + */ + mutex_enter(&ufs_ihash_lock); + vp = ufs_ihashlookup(ump->um_dev, blkp->bi_inode); + if (vp != NULL && !(vp->v_iflag & VI_XLOCK)) { + ip = VTOI(vp); + mutex_enter(vp->v_interlock); + mutex_exit(&ufs_ihash_lock); + if (lfs_vref(vp)) { + v_daddr = LFS_UNUSED_DADDR; + continue; + } + numrefed++; + } else { + mutex_exit(&ufs_ihash_lock); + /* + * Don't VFS_VGET if we're being unmounted, + * since we hold vfs_busy(). + */ + if (mntp->mnt_iflag & IMNT_UNMOUNT) { + v_daddr = LFS_UNUSED_DADDR; + continue; + } + error = VFS_VGET(mntp, blkp->bi_inode, &vp); + if (error) { + DLOG((DLOG_CLEAN, "lfs_bmapv: vget ino" + "%d failed with %d", + blkp->bi_inode,error)); + v_daddr = LFS_UNUSED_DADDR; + continue; + } else { + KASSERT(VOP_ISLOCKED(vp)); + VOP_UNLOCK(vp); + numrefed++; + } + } + ip = VTOI(vp); + } else if (v_daddr == LFS_UNUSED_DADDR) { + /* + * This can only happen if the vnode is dead. + * Keep going. Note that we DO NOT set the + * bi_addr to anything -- if we failed to get + * the vnode, for example, we want to assume + * conservatively that all of its blocks *are* + * located in the segment in question. + * lfs_markv will throw them out if we are + * wrong. + */ + /* blkp->bi_daddr = LFS_UNUSED_DADDR; */ + continue; + } + + /* Past this point we are guaranteed that vp, ip are valid. */ + + if (blkp->bi_lbn == LFS_UNUSED_LBN) { + /* + * We just want the inode address, which is + * conveniently in v_daddr. + */ + blkp->bi_daddr = v_daddr; + } else { + daddr_t bi_daddr; + + /* XXX ondisk32 */ + error = VOP_BMAP(vp, blkp->bi_lbn, NULL, + &bi_daddr, NULL); + if (error) + { + blkp->bi_daddr = LFS_UNUSED_DADDR; + continue; + } + blkp->bi_daddr = dbtofsb(fs, bi_daddr); + /* Fill in the block size, too */ + if (blkp->bi_lbn >= 0) + blkp->bi_size = blksize(fs, ip, blkp->bi_lbn); + else + blkp->bi_size = fs->lfs_bsize; + } + } + + /* + * Finish the old file, if there was one. The presence + * of a usable vnode in vp is signaled by a valid v_daddr. + */ + if (v_daddr != LFS_UNUSED_DADDR) { + lfs_vunref(vp); + numrefed--; + } + +#ifdef DIAGNOSTIC + if (numrefed != 0) + panic("lfs_bmapv: numrefed=%d", numrefed); +#endif + + vfs_unbusy(mntp, false, NULL); + + return 0; +} + +/* + * sys_lfs_segclean: + * + * Mark the segment clean. + * + * 0 on success + * -1/errno is return on error. 
+ */ +int +sys_lfs_segclean(struct lwp *l, const struct sys_lfs_segclean_args *uap, register_t *retval) +{ + /* { + syscallarg(fsid_t *) fsidp; + syscallarg(u_long) segment; + } */ + struct lfs *fs; + struct mount *mntp; + fsid_t fsid; + int error; + unsigned long segnum; + + if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, + NULL)) != 0) + return (error); + + if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) + return (error); + if ((mntp = vfs_getvfs(&fsid)) == NULL) + return (ENOENT); + + fs = VFSTOUFS(mntp)->um_lfs; + segnum = SCARG(uap, segment); + + if ((error = vfs_busy(mntp, NULL)) != 0) + return (error); + + KERNEL_LOCK(1, NULL); + lfs_seglock(fs, SEGM_PROT); + error = lfs_do_segclean(fs, segnum); + lfs_segunlock(fs); + KERNEL_UNLOCK_ONE(NULL); + vfs_unbusy(mntp, false, NULL); + return error; +} + +/* + * Actually mark the segment clean. + * Must be called with the segment lock held. + */ +int +lfs_do_segclean(struct lfs *fs, unsigned long segnum) +{ + extern int lfs_dostats; + struct buf *bp; + CLEANERINFO *cip; + SEGUSE *sup; + + if (dtosn(fs, fs->lfs_curseg) == segnum) { + return (EBUSY); + } + + LFS_SEGENTRY(sup, fs, segnum, bp); + if (sup->su_nbytes) { + DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:" + " %d live bytes\n", segnum, sup->su_nbytes)); + brelse(bp, 0); + return (EBUSY); + } + if (sup->su_flags & SEGUSE_ACTIVE) { + DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:" + " segment is active\n", segnum)); + brelse(bp, 0); + return (EBUSY); + } + if (!(sup->su_flags & SEGUSE_DIRTY)) { + DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:" + " segment is already clean\n", segnum)); + brelse(bp, 0); + return (EALREADY); + } + + fs->lfs_avail += segtod(fs, 1); + if (sup->su_flags & SEGUSE_SUPERBLOCK) + fs->lfs_avail -= btofsb(fs, LFS_SBPAD); + if (fs->lfs_version > 1 && segnum == 0 && + fs->lfs_start < btofsb(fs, LFS_LABELPAD)) + fs->lfs_avail -= btofsb(fs, LFS_LABELPAD) - fs->lfs_start; + mutex_enter(&lfs_lock); + fs->lfs_bfree += sup->su_nsums * btofsb(fs, fs->lfs_sumsize) + + btofsb(fs, sup->su_ninos * fs->lfs_ibsize); + fs->lfs_dmeta -= sup->su_nsums * btofsb(fs, fs->lfs_sumsize) + + btofsb(fs, sup->su_ninos * fs->lfs_ibsize); + if (fs->lfs_dmeta < 0) + fs->lfs_dmeta = 0; + mutex_exit(&lfs_lock); + sup->su_flags &= ~SEGUSE_DIRTY; + LFS_WRITESEGENTRY(sup, fs, segnum, bp); + + LFS_CLEANERINFO(cip, fs, bp); + ++cip->clean; + --cip->dirty; + fs->lfs_nclean = cip->clean; + cip->bfree = fs->lfs_bfree; + mutex_enter(&lfs_lock); + cip->avail = fs->lfs_avail - fs->lfs_ravail - fs->lfs_favail; + wakeup(&fs->lfs_avail); + mutex_exit(&lfs_lock); + (void) LFS_BWRITE_LOG(bp); + + if (lfs_dostats) + ++lfs_stats.segs_reclaimed; + + return (0); +} + +/* + * This will block until a segment in file system fsid is written. A timeout + * in milliseconds may be specified which will awake the cleaner automatically. + * An fsid of -1 means any file system, and a timeout of 0 means forever. + */ +int +lfs_segwait(fsid_t *fsidp, struct timeval *tv) +{ + struct mount *mntp; + void *addr; + u_long timeout; + int error; + + KERNEL_LOCK(1, NULL); + if (fsidp == NULL || (mntp = vfs_getvfs(fsidp)) == NULL) + addr = &lfs_allclean_wakeup; + else + addr = &VFSTOUFS(mntp)->um_lfs->lfs_nextseg; + /* + * XXX THIS COULD SLEEP FOREVER IF TIMEOUT IS {0,0}! + * XXX IS THAT WHAT IS INTENDED? + */ + timeout = tvtohz(tv); + error = tsleep(addr, PCATCH | PVFS, "segment", timeout); + KERNEL_UNLOCK_ONE(NULL); + return (error == ERESTART ? 
EINTR : 0); +} + +/* + * sys_lfs_segwait: + * + * System call wrapper around lfs_segwait(). + * + * 0 on success + * 1 on timeout + * -1/errno is return on error. + */ +int +sys___lfs_segwait50(struct lwp *l, const struct sys___lfs_segwait50_args *uap, + register_t *retval) +{ + /* { + syscallarg(fsid_t *) fsidp; + syscallarg(struct timeval *) tv; + } */ + struct timeval atv; + fsid_t fsid; + int error; + + /* XXX need we be su to segwait? */ + if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, + NULL)) != 0) + return (error); + if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) + return (error); + + if (SCARG(uap, tv)) { + error = copyin(SCARG(uap, tv), &atv, sizeof(struct timeval)); + if (error) + return (error); + if (itimerfix(&atv)) + return (EINVAL); + } else /* NULL or invalid */ + atv.tv_sec = atv.tv_usec = 0; + return lfs_segwait(&fsid, &atv); +} + +/* + * VFS_VGET call specialized for the cleaner. The cleaner already knows the + * daddr from the ifile, so don't look it up again. If the cleaner is + * processing IINFO structures, it may have the ondisk inode already, so + * don't go retrieving it again. + * + * we lfs_vref, and it is the caller's responsibility to lfs_vunref + * when finished. + */ + +int +lfs_fasthashget(dev_t dev, ino_t ino, struct vnode **vpp) +{ + struct vnode *vp; + + mutex_enter(&ufs_ihash_lock); + if ((vp = ufs_ihashlookup(dev, ino)) != NULL) { + mutex_enter(vp->v_interlock); + mutex_exit(&ufs_ihash_lock); + if (vp->v_iflag & VI_XLOCK) { + DLOG((DLOG_CLEAN, "lfs_fastvget: ino %d VI_XLOCK\n", + ino)); + lfs_stats.clean_vnlocked++; + mutex_exit(vp->v_interlock); + return EAGAIN; + } + if (lfs_vref(vp)) { + DLOG((DLOG_CLEAN, "lfs_fastvget: lfs_vref failed" + " for ino %d\n", ino)); + lfs_stats.clean_inlocked++; + return EAGAIN; + } + } else { + mutex_exit(&ufs_ihash_lock); + } + *vpp = vp; + + return (0); +} + +int +lfs_fastvget(struct mount *mp, ino_t ino, daddr_t daddr, struct vnode **vpp, + struct ufs1_dinode *dinp) +{ + struct inode *ip; + struct ufs1_dinode *dip; + struct vnode *vp; + struct ufsmount *ump; + dev_t dev; + int error, retries; + struct buf *bp; + struct lfs *fs; + + ump = VFSTOUFS(mp); + dev = ump->um_dev; + fs = ump->um_lfs; + + /* + * Wait until the filesystem is fully mounted before allowing vget + * to complete. This prevents possible problems with roll-forward. + */ + mutex_enter(&lfs_lock); + while (fs->lfs_flags & LFS_NOTYET) { + mtsleep(&fs->lfs_flags, PRIBIO+1, "lfs_fnotyet", 0, + &lfs_lock); + } + mutex_exit(&lfs_lock); + + /* + * This is playing fast and loose. Someone may have the inode + * locked, in which case they are going to be distinctly unhappy + * if we trash something. + */ + + error = lfs_fasthashget(dev, ino, vpp); + if (error != 0 || *vpp != NULL) + return (error); + + /* + * getnewvnode(9) will call vfs_busy, which will block if the + * filesystem is being unmounted; but umount(9) is waiting for + * us because we're already holding the fs busy. + * XXXMP + */ + if (mp->mnt_iflag & IMNT_UNMOUNT) { + *vpp = NULL; + return EDEADLK; + } + error = getnewvnode(VT_LFS, mp, lfs_vnodeop_p, NULL, &vp); + if (error) { + *vpp = NULL; + return (error); + } + + mutex_enter(&ufs_hashlock); + error = lfs_fasthashget(dev, ino, vpp); + if (error != 0 || *vpp != NULL) { + mutex_exit(&ufs_hashlock); + ungetnewvnode(vp); + return (error); + } + + /* Allocate new vnode/inode. 
*/ + lfs_vcreate(mp, ino, vp); + + /* + * Put it onto its hash chain and lock it so that other requests for + * this inode will block if they arrive while we are sleeping waiting + * for old data structures to be purged or for the contents of the + * disk portion of this inode to be read. + */ + ip = VTOI(vp); + ufs_ihashins(ip); + mutex_exit(&ufs_hashlock); + + /* + * XXX + * This may not need to be here, logically it should go down with + * the i_devvp initialization. + * Ask Kirk. + */ + ip->i_lfs = fs; + + /* Read in the disk contents for the inode, copy into the inode. */ + if (dinp) { + error = copyin(dinp, ip->i_din.ffs1_din, sizeof (struct ufs1_dinode)); + if (error) { + DLOG((DLOG_CLEAN, "lfs_fastvget: dinode copyin failed" + " for ino %d\n", ino)); + ufs_ihashrem(ip); + + /* Unlock and discard unneeded inode. */ + VOP_UNLOCK(vp); + lfs_vunref(vp); + *vpp = NULL; + return (error); + } + if (ip->i_number != ino) + panic("lfs_fastvget: I was fed the wrong inode!"); + } else { + retries = 0; + again: + error = bread(ump->um_devvp, fsbtodb(fs, daddr), fs->lfs_ibsize, + NOCRED, 0, &bp); + if (error) { + DLOG((DLOG_CLEAN, "lfs_fastvget: bread failed (%d)\n", + error)); + /* + * The inode does not contain anything useful, so it + * would be misleading to leave it on its hash chain. + * Iput() will return it to the free list. + */ + ufs_ihashrem(ip); + + /* Unlock and discard unneeded inode. */ + VOP_UNLOCK(vp); + lfs_vunref(vp); + brelse(bp, 0); + *vpp = NULL; + return (error); + } + dip = lfs_ifind(ump->um_lfs, ino, bp); + if (dip == NULL) { + /* Assume write has not completed yet; try again */ + brelse(bp, BC_INVAL); + ++retries; + if (retries > LFS_IFIND_RETRIES) + panic("lfs_fastvget: dinode not found"); + DLOG((DLOG_CLEAN, "lfs_fastvget: dinode not found," + " retrying...\n")); + goto again; + } + *ip->i_din.ffs1_din = *dip; + brelse(bp, 0); + } + lfs_vinit(mp, &vp); + + *vpp = vp; + + KASSERT(VOP_ISLOCKED(vp)); + VOP_UNLOCK(vp); + + return (0); +} + +/* + * Make up a "fake" cleaner buffer, copy the data from userland into it. + */ +struct buf * +lfs_fakebuf(struct lfs *fs, struct vnode *vp, int lbn, size_t size, void *uaddr) +{ + struct buf *bp; + int error; + + KASSERT(VTOI(vp)->i_number != LFS_IFILE_INUM); + + bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, size, LFS_NB_CLEAN); + error = copyin(uaddr, bp->b_data, size); + if (error) { + lfs_freebuf(fs, bp); + return NULL; + } + KDASSERT(bp->b_iodone == lfs_callback); + +#if 0 + mutex_enter(&lfs_lock); + ++fs->lfs_iocount; + mutex_exit(&lfs_lock); +#endif + bp->b_bufsize = size; + bp->b_bcount = size; + return (bp); +} diff --git a/sys/ufs/lfs/lfs_vfsops.c b/sys/ufs/lfs/lfs_vfsops.c new file mode 100644 index 000000000..7769e94a1 --- /dev/null +++ b/sys/ufs/lfs/lfs_vfsops.c @@ -0,0 +1,2138 @@ +/* $NetBSD: lfs_vfsops.c,v 1.291 2011/11/14 18:35:14 hannken Exp $ */ + +/*- + * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2007 + * The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Konrad E. Schroder . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/*- + * Copyright (c) 1989, 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)lfs_vfsops.c 8.20 (Berkeley) 6/10/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.291 2011/11/14 18:35:14 hannken Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_lfs.h" +#include "opt_quota.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +MODULE(MODULE_CLASS_VFS, lfs, "ffs"); + +static int lfs_gop_write(struct vnode *, struct vm_page **, int, int); +static bool lfs_issequential_hole(const struct ufsmount *, + daddr_t, daddr_t); + +static int lfs_mountfs(struct vnode *, struct mount *, struct lwp *); + +static struct sysctllog *lfs_sysctl_log; + +extern const struct vnodeopv_desc lfs_vnodeop_opv_desc; +extern const struct vnodeopv_desc lfs_specop_opv_desc; +extern const struct vnodeopv_desc lfs_fifoop_opv_desc; + +pid_t lfs_writer_daemon = 0; +int lfs_do_flush = 0; +#ifdef LFS_KERNEL_RFW +int lfs_do_rfw = 0; +#endif + +const struct vnodeopv_desc * const lfs_vnodeopv_descs[] = { + &lfs_vnodeop_opv_desc, + &lfs_specop_opv_desc, + &lfs_fifoop_opv_desc, + NULL, +}; + +struct vfsops lfs_vfsops = { + MOUNT_LFS, + sizeof (struct ufs_args), + lfs_mount, + ufs_start, + lfs_unmount, + ufs_root, + ufs_quotactl, + lfs_statvfs, + lfs_sync, + lfs_vget, + lfs_fhtovp, + lfs_vptofh, + lfs_init, + lfs_reinit, + lfs_done, + lfs_mountroot, + (int (*)(struct mount *, struct vnode *, struct timespec *)) eopnotsupp, + vfs_stdextattrctl, + (void *)eopnotsupp, /* vfs_suspendctl */ + genfs_renamelock_enter, + genfs_renamelock_exit, + (void *)eopnotsupp, + lfs_vnodeopv_descs, + 0, + { NULL, NULL }, +}; + +const struct genfs_ops lfs_genfsops = { + .gop_size = lfs_gop_size, + .gop_alloc = ufs_gop_alloc, + .gop_write = lfs_gop_write, + .gop_markupdate = ufs_gop_markupdate, +}; + +static const struct ufs_ops lfs_ufsops = { + .uo_itimes = NULL, + .uo_update = lfs_update, + .uo_truncate = lfs_truncate, + .uo_valloc = lfs_valloc, + .uo_vfree = lfs_vfree, + .uo_balloc = lfs_balloc, + .uo_unmark_vnode = lfs_unmark_vnode, +}; + +struct shortlong { + const char *sname; + const char *lname; +}; + +static int +sysctl_lfs_dostats(SYSCTLFN_ARGS) +{ + extern struct lfs_stats lfs_stats; + extern int lfs_dostats; + int error; + + error = sysctl_lookup(SYSCTLFN_CALL(rnode)); + if (error || newp == NULL) + return (error); + + if (lfs_dostats == 0) + memset(&lfs_stats, 0, sizeof(lfs_stats)); + + return (0); +} + +static void +lfs_sysctl_setup(struct sysctllog **clog) +{ + int i; + extern int lfs_writeindir, lfs_dostats, lfs_clean_vnhead, + lfs_fs_pagetrip, lfs_ignore_lazy_sync; +#ifdef DEBUG + extern int lfs_debug_log_subsys[DLOG_MAX]; + struct shortlong dlog_names[DLOG_MAX] = { /* Must match lfs.h ! 
*/ + { "rollforward", "Debug roll-forward code" }, + { "alloc", "Debug inode allocation and free list" }, + { "avail", "Debug space-available-now accounting" }, + { "flush", "Debug flush triggers" }, + { "lockedlist", "Debug locked list accounting" }, + { "vnode_verbose", "Verbose per-vnode-written debugging" }, + { "vnode", "Debug vnode use during segment write" }, + { "segment", "Debug segment writing" }, + { "seguse", "Debug segment used-bytes accounting" }, + { "cleaner", "Debug cleaning routines" }, + { "mount", "Debug mount/unmount routines" }, + { "pagecache", "Debug UBC interactions" }, + { "dirop", "Debug directory-operation accounting" }, + { "malloc", "Debug private malloc accounting" }, + }; +#endif /* DEBUG */ + struct shortlong stat_names[] = { /* Must match lfs.h! */ + { "segsused", "Number of new segments allocated" }, + { "psegwrites", "Number of partial-segment writes" }, + { "psyncwrites", "Number of synchronous partial-segment" + " writes" }, + { "pcleanwrites", "Number of partial-segment writes by the" + " cleaner" }, + { "blocktot", "Number of blocks written" }, + { "cleanblocks", "Number of blocks written by the cleaner" }, + { "ncheckpoints", "Number of checkpoints made" }, + { "nwrites", "Number of whole writes" }, + { "nsync_writes", "Number of synchronous writes" }, + { "wait_exceeded", "Number of times writer waited for" + " cleaner" }, + { "write_exceeded", "Number of times writer invoked flush" }, + { "flush_invoked", "Number of times flush was invoked" }, + { "vflush_invoked", "Number of time vflush was called" }, + { "clean_inlocked", "Number of vnodes skipped for VI_XLOCK" }, + { "clean_vnlocked", "Number of vnodes skipped for vget failure" }, + { "segs_reclaimed", "Number of segments reclaimed" }, + }; + + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "vfs", NULL, + NULL, 0, NULL, 0, + CTL_VFS, CTL_EOL); + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "lfs", + SYSCTL_DESCR("Log-structured file system"), + NULL, 0, NULL, 0, + CTL_VFS, 5, CTL_EOL); + /* + * XXX the "5" above could be dynamic, thereby eliminating one + * more instance of the "number to vfs" mapping problem, but + * "5" is the order as taken from sys/mount.h + */ + + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "flushindir", NULL, + NULL, 0, &lfs_writeindir, 0, + CTL_VFS, 5, LFS_WRITEINDIR, CTL_EOL); + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "clean_vnhead", NULL, + NULL, 0, &lfs_clean_vnhead, 0, + CTL_VFS, 5, LFS_CLEAN_VNHEAD, CTL_EOL); + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "dostats", + SYSCTL_DESCR("Maintain statistics on LFS operations"), + sysctl_lfs_dostats, 0, &lfs_dostats, 0, + CTL_VFS, 5, LFS_DOSTATS, CTL_EOL); + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "pagetrip", + SYSCTL_DESCR("How many dirty pages in fs triggers" + " a flush"), + NULL, 0, &lfs_fs_pagetrip, 0, + CTL_VFS, 5, LFS_FS_PAGETRIP, CTL_EOL); + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "ignore_lazy_sync", + SYSCTL_DESCR("Lazy Sync is ignored entirely"), + NULL, 0, &lfs_ignore_lazy_sync, 0, + CTL_VFS, 5, LFS_IGNORE_LAZY_SYNC, CTL_EOL); +#ifdef LFS_KERNEL_RFW + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "rfw", + SYSCTL_DESCR("Use in-kernel roll-forward on mount"), + NULL, 0, &lfs_do_rfw, 
0, + CTL_VFS, 5, LFS_DO_RFW, CTL_EOL); +#endif + + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "stats", + SYSCTL_DESCR("Debugging options"), + NULL, 0, NULL, 0, + CTL_VFS, 5, LFS_STATS, CTL_EOL); + for (i = 0; i < sizeof(struct lfs_stats) / sizeof(u_int); i++) { + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READONLY, + CTLTYPE_INT, stat_names[i].sname, + SYSCTL_DESCR(stat_names[i].lname), + NULL, 0, &(((u_int *)&lfs_stats.segsused)[i]), + 0, CTL_VFS, 5, LFS_STATS, i, CTL_EOL); + } + +#ifdef DEBUG + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "debug", + SYSCTL_DESCR("Debugging options"), + NULL, 0, NULL, 0, + CTL_VFS, 5, LFS_DEBUGLOG, CTL_EOL); + for (i = 0; i < DLOG_MAX; i++) { + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, dlog_names[i].sname, + SYSCTL_DESCR(dlog_names[i].lname), + NULL, 0, &(lfs_debug_log_subsys[i]), 0, + CTL_VFS, 5, LFS_DEBUGLOG, i, CTL_EOL); + } +#endif +} + +/* old cleaner syscall interface. see VOP_FCNTL() */ +static const struct syscall_package lfs_syscalls[] = { + { SYS_lfs_bmapv, 0, (sy_call_t *)sys_lfs_bmapv }, + { SYS_lfs_markv, 0, (sy_call_t *)sys_lfs_markv }, + { SYS_lfs_segclean, 0, (sy_call_t *)sys___lfs_segwait50 }, + { 0, 0, NULL }, +}; + +static int +lfs_modcmd(modcmd_t cmd, void *arg) +{ + int error; + + switch (cmd) { + case MODULE_CMD_INIT: + error = syscall_establish(NULL, lfs_syscalls); + if (error) + return error; + error = vfs_attach(&lfs_vfsops); + if (error != 0) { + syscall_disestablish(NULL, lfs_syscalls); + break; + } + lfs_sysctl_setup(&lfs_sysctl_log); + break; + case MODULE_CMD_FINI: + error = vfs_detach(&lfs_vfsops); + if (error != 0) + break; + syscall_disestablish(NULL, lfs_syscalls); + sysctl_teardown(&lfs_sysctl_log); + break; + default: + error = ENOTTY; + break; + } + + return (error); +} + +/* + * XXX Same structure as FFS inodes? Should we share a common pool? + */ +struct pool lfs_inode_pool; +struct pool lfs_dinode_pool; +struct pool lfs_inoext_pool; +struct pool lfs_lbnentry_pool; + +/* + * The writer daemon. UVM keeps track of how many dirty pages we are holding + * in lfs_subsys_pages; the daemon flushes the filesystem when this value + * crosses the (user-defined) threshhold LFS_MAX_PAGES. + */ +static void +lfs_writerd(void *arg) +{ + struct mount *mp, *nmp; + struct lfs *fs; + int fsflags; + int loopcount; + + lfs_writer_daemon = curproc->p_pid; + + mutex_enter(&lfs_lock); + for (;;) { + mtsleep(&lfs_writer_daemon, PVM | PNORELOCK, "lfswriter", hz/10, + &lfs_lock); + + /* + * Look through the list of LFSs to see if any of them + * have requested pageouts. 
+ */ + mutex_enter(&mountlist_lock); + for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist; + mp = nmp) { + if (vfs_busy(mp, &nmp)) { + continue; + } + if (strncmp(mp->mnt_stat.f_fstypename, MOUNT_LFS, + sizeof(mp->mnt_stat.f_fstypename)) == 0) { + fs = VFSTOUFS(mp)->um_lfs; + mutex_enter(&lfs_lock); + fsflags = 0; + if ((fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) || + lfs_dirvcount > LFS_MAX_DIROP) && + fs->lfs_dirops == 0) + fsflags |= SEGM_CKP; + if (fs->lfs_pdflush) { + DLOG((DLOG_FLUSH, "lfs_writerd: pdflush set\n")); + fs->lfs_pdflush = 0; + lfs_flush_fs(fs, fsflags); + mutex_exit(&lfs_lock); + } else if (!TAILQ_EMPTY(&fs->lfs_pchainhd)) { + DLOG((DLOG_FLUSH, "lfs_writerd: pchain non-empty\n")); + mutex_exit(&lfs_lock); + lfs_writer_enter(fs, "wrdirop"); + lfs_flush_pchain(fs); + lfs_writer_leave(fs); + } else + mutex_exit(&lfs_lock); + } + vfs_unbusy(mp, false, &nmp); + } + mutex_exit(&mountlist_lock); + + /* + * If global state wants a flush, flush everything. + */ + mutex_enter(&lfs_lock); + loopcount = 0; + if (lfs_do_flush || locked_queue_count > LFS_MAX_BUFS || + locked_queue_bytes > LFS_MAX_BYTES || + lfs_subsys_pages > LFS_MAX_PAGES) { + + if (lfs_do_flush) { + DLOG((DLOG_FLUSH, "daemon: lfs_do_flush\n")); + } + if (locked_queue_count > LFS_MAX_BUFS) { + DLOG((DLOG_FLUSH, "daemon: lqc = %d, max %d\n", + locked_queue_count, LFS_MAX_BUFS)); + } + if (locked_queue_bytes > LFS_MAX_BYTES) { + DLOG((DLOG_FLUSH, "daemon: lqb = %ld, max %ld\n", + locked_queue_bytes, LFS_MAX_BYTES)); + } + if (lfs_subsys_pages > LFS_MAX_PAGES) { + DLOG((DLOG_FLUSH, "daemon: lssp = %d, max %d\n", + lfs_subsys_pages, LFS_MAX_PAGES)); + } + + lfs_flush(NULL, SEGM_WRITERD, 0); + lfs_do_flush = 0; + } + } + /* NOTREACHED */ +} + +/* + * Initialize the filesystem, most work done by ufs_init. + */ +void +lfs_init(void) +{ + + malloc_type_attach(M_SEGMENT); + pool_init(&lfs_inode_pool, sizeof(struct inode), 0, 0, 0, + "lfsinopl", &pool_allocator_nointr, IPL_NONE); + pool_init(&lfs_dinode_pool, sizeof(struct ufs1_dinode), 0, 0, 0, + "lfsdinopl", &pool_allocator_nointr, IPL_NONE); + pool_init(&lfs_inoext_pool, sizeof(struct lfs_inode_ext), 8, 0, 0, + "lfsinoextpl", &pool_allocator_nointr, IPL_NONE); + pool_init(&lfs_lbnentry_pool, sizeof(struct lbnentry), 0, 0, 0, + "lfslbnpool", &pool_allocator_nointr, IPL_NONE); + ufs_init(); + +#ifdef DEBUG + memset(lfs_log, 0, sizeof(lfs_log)); +#endif + mutex_init(&lfs_lock, MUTEX_DEFAULT, IPL_NONE); + cv_init(&locked_queue_cv, "lfsbuf"); + cv_init(&lfs_writing_cv, "lfsflush"); +} + +void +lfs_reinit(void) +{ + ufs_reinit(); +} + +void +lfs_done(void) +{ + ufs_done(); + mutex_destroy(&lfs_lock); + cv_destroy(&locked_queue_cv); + cv_destroy(&lfs_writing_cv); + pool_destroy(&lfs_inode_pool); + pool_destroy(&lfs_dinode_pool); + pool_destroy(&lfs_inoext_pool); + pool_destroy(&lfs_lbnentry_pool); + malloc_type_detach(M_SEGMENT); +} + +/* + * Called by main() when ufs is going to be mounted as root. 
+ */ +int +lfs_mountroot(void) +{ + extern struct vnode *rootvp; + struct lfs *fs = NULL; /* LFS */ + struct mount *mp; + struct lwp *l = curlwp; + struct ufsmount *ump; + int error; + + if (device_class(root_device) != DV_DISK) + return (ENODEV); + + if (rootdev == NODEV) + return (ENODEV); + if ((error = vfs_rootmountalloc(MOUNT_LFS, "root_device", &mp))) { + vrele(rootvp); + return (error); + } + if ((error = lfs_mountfs(rootvp, mp, l))) { + vfs_unbusy(mp, false, NULL); + vfs_destroy(mp); + return (error); + } + mutex_enter(&mountlist_lock); + CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mutex_exit(&mountlist_lock); + ump = VFSTOUFS(mp); + fs = ump->um_lfs; + memset(fs->lfs_fsmnt, 0, sizeof(fs->lfs_fsmnt)); + (void)copystr(mp->mnt_stat.f_mntonname, fs->lfs_fsmnt, MNAMELEN - 1, 0); + (void)lfs_statvfs(mp, &mp->mnt_stat); + vfs_unbusy(mp, false, NULL); + setrootfstime((time_t)(VFSTOUFS(mp)->um_lfs->lfs_tstamp)); + return (0); +} + +/* + * VFS Operations. + * + * mount system call + */ +int +lfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) +{ + struct lwp *l = curlwp; + struct vnode *devvp; + struct ufs_args *args = data; + struct ufsmount *ump = NULL; + struct lfs *fs = NULL; /* LFS */ + int error = 0, update; + mode_t accessmode; + + if (*data_len < sizeof *args) + return EINVAL; + + if (mp->mnt_flag & MNT_GETARGS) { + ump = VFSTOUFS(mp); + if (ump == NULL) + return EIO; + args->fspec = NULL; + *data_len = sizeof *args; + return 0; + } + + update = mp->mnt_flag & MNT_UPDATE; + + /* Check arguments */ + if (args->fspec != NULL) { + /* + * Look up the name and verify that it's sane. + */ + error = namei_simple_user(args->fspec, + NSM_FOLLOW_NOEMULROOT, &devvp); + if (error != 0) + return (error); + + if (!update) { + /* + * Be sure this is a valid block device + */ + if (devvp->v_type != VBLK) + error = ENOTBLK; + else if (bdevsw_lookup(devvp->v_rdev) == NULL) + error = ENXIO; + } else { + /* + * Be sure we're still naming the same device + * used for our initial mount + */ + ump = VFSTOUFS(mp); + if (devvp != ump->um_devvp) { + if (devvp->v_rdev != ump->um_devvp->v_rdev) + error = EINVAL; + else { + vrele(devvp); + devvp = ump->um_devvp; + vref(devvp); + } + } + } + } else { + if (!update) { + /* New mounts must have a filename for the device */ + return (EINVAL); + } else { + /* Use the extant mount */ + ump = VFSTOUFS(mp); + devvp = ump->um_devvp; + vref(devvp); + } + } + + + /* + * If mount by non-root, then verify that user has necessary + * permissions on the device. + */ + if (error == 0) { + accessmode = VREAD; + if (update ? + (mp->mnt_iflag & IMNT_WANTRDWR) != 0 : + (mp->mnt_flag & MNT_RDONLY) == 0) + accessmode |= VWRITE; + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = genfs_can_mount(devvp, accessmode, l->l_cred); + VOP_UNLOCK(devvp); + } + + if (error) { + vrele(devvp); + return (error); + } + + if (!update) { + int flags; + + if (mp->mnt_flag & MNT_RDONLY) + flags = FREAD; + else + flags = FREAD|FWRITE; + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_OPEN(devvp, flags, FSCRED); + VOP_UNLOCK(devvp); + if (error) + goto fail; + error = lfs_mountfs(devvp, mp, l); /* LFS */ + if (error) { + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + (void)VOP_CLOSE(devvp, flags, NOCRED); + VOP_UNLOCK(devvp); + goto fail; + } + + ump = VFSTOUFS(mp); + fs = ump->um_lfs; + } else { + /* + * Update the mount. + */ + + /* + * The initial mount got a reference on this + * device, so drop the one obtained via + * namei(), above. 
+ */ + vrele(devvp); + + ump = VFSTOUFS(mp); + fs = ump->um_lfs; + if (fs->lfs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR)) { + /* + * Changing from read-only to read/write. + * Note in the superblocks that we're writing. + */ + fs->lfs_ronly = 0; + if (fs->lfs_pflags & LFS_PF_CLEAN) { + fs->lfs_pflags &= ~LFS_PF_CLEAN; + lfs_writesuper(fs, fs->lfs_sboffs[0]); + lfs_writesuper(fs, fs->lfs_sboffs[1]); + } + } + if (args->fspec == NULL) + return EINVAL; + } + + error = set_statvfs_info(path, UIO_USERSPACE, args->fspec, + UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l); + if (error == 0) + (void)strncpy(fs->lfs_fsmnt, mp->mnt_stat.f_mntonname, + sizeof(fs->lfs_fsmnt)); + return error; + +fail: + vrele(devvp); + return (error); +} + + +/* + * Common code for mount and mountroot + * LFS specific + */ +int +lfs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l) +{ + struct dlfs *tdfs, *dfs, *adfs; + struct lfs *fs; + struct ufsmount *ump; + struct vnode *vp; + struct buf *bp, *abp; + dev_t dev; + int error, i, ronly, fsbsize; + kauth_cred_t cred; + CLEANERINFO *cip; + SEGUSE *sup; + daddr_t sb_addr; + + cred = l ? l->l_cred : NOCRED; + + /* + * Flush out any old buffers remaining from a previous use. + */ + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = vinvalbuf(devvp, V_SAVE, cred, l, 0, 0); + VOP_UNLOCK(devvp); + if (error) + return (error); + + ronly = (mp->mnt_flag & MNT_RDONLY) != 0; + + /* Don't free random space on error. */ + bp = NULL; + abp = NULL; + ump = NULL; + + sb_addr = LFS_LABELPAD / DEV_BSIZE; + while (1) { + /* Read in the superblock. */ + error = bread(devvp, sb_addr, LFS_SBPAD, cred, 0, &bp); + if (error) + goto out; + dfs = (struct dlfs *)bp->b_data; + + /* Check the basics. */ + if (dfs->dlfs_magic != LFS_MAGIC || dfs->dlfs_bsize > MAXBSIZE || + dfs->dlfs_version > LFS_VERSION || + dfs->dlfs_bsize < sizeof(struct dlfs)) { + DLOG((DLOG_MOUNT, "lfs_mountfs: primary superblock sanity failed\n")); + error = EINVAL; /* XXX needs translation */ + goto out; + } + if (dfs->dlfs_inodefmt > LFS_MAXINODEFMT) { + DLOG((DLOG_MOUNT, "lfs_mountfs: unknown inode format %d\n", + dfs->dlfs_inodefmt)); + error = EINVAL; + goto out; + } + + if (dfs->dlfs_version == 1) + fsbsize = DEV_BSIZE; + else { + fsbsize = 1 << dfs->dlfs_ffshift; + /* + * Could be, if the frag size is large enough, that we + * don't have the "real" primary superblock. If that's + * the case, get the real one, and try again. + */ + if (sb_addr != (dfs->dlfs_sboffs[0] << (dfs->dlfs_ffshift - DEV_BSHIFT))) { + DLOG((DLOG_MOUNT, "lfs_mountfs: sb daddr" + " 0x%llx is not right, trying 0x%llx\n", + (long long)sb_addr, + (long long)(dfs->dlfs_sboffs[0] << (dfs->dlfs_ffshift - DEV_BSHIFT)))); + sb_addr = dfs->dlfs_sboffs[0] << (dfs->dlfs_ffshift - DEV_BSHIFT); + brelse(bp, 0); + continue; + } + } + break; + } + + /* + * Check the second superblock to see which is newer; then mount + * using the older of the two. This is necessary to ensure that + * the filesystem is valid if it was not unmounted cleanly. 
+ */ + + if (dfs->dlfs_sboffs[1] && + dfs->dlfs_sboffs[1] - LFS_LABELPAD / fsbsize > LFS_SBPAD / fsbsize) + { + error = bread(devvp, dfs->dlfs_sboffs[1] * (fsbsize / DEV_BSIZE), + LFS_SBPAD, cred, 0, &abp); + if (error) + goto out; + adfs = (struct dlfs *)abp->b_data; + + if (dfs->dlfs_version == 1) { + /* 1s resolution comparison */ + if (adfs->dlfs_tstamp < dfs->dlfs_tstamp) + tdfs = adfs; + else + tdfs = dfs; + } else { + /* monotonic infinite-resolution comparison */ + if (adfs->dlfs_serial < dfs->dlfs_serial) + tdfs = adfs; + else + tdfs = dfs; + } + + /* Check the basics. */ + if (tdfs->dlfs_magic != LFS_MAGIC || + tdfs->dlfs_bsize > MAXBSIZE || + tdfs->dlfs_version > LFS_VERSION || + tdfs->dlfs_bsize < sizeof(struct dlfs)) { + DLOG((DLOG_MOUNT, "lfs_mountfs: alt superblock" + " sanity failed\n")); + error = EINVAL; /* XXX needs translation */ + goto out; + } + } else { + DLOG((DLOG_MOUNT, "lfs_mountfs: invalid alt superblock" + " daddr=0x%x\n", dfs->dlfs_sboffs[1])); + error = EINVAL; + goto out; + } + + /* Allocate the mount structure, copy the superblock into it. */ + fs = malloc(sizeof(struct lfs), M_UFSMNT, M_WAITOK | M_ZERO); + memcpy(&fs->lfs_dlfs, tdfs, sizeof(struct dlfs)); + + /* Compatibility */ + if (fs->lfs_version < 2) { + fs->lfs_sumsize = LFS_V1_SUMMARY_SIZE; + fs->lfs_ibsize = fs->lfs_bsize; + fs->lfs_start = fs->lfs_sboffs[0]; + fs->lfs_tstamp = fs->lfs_otstamp; + fs->lfs_fsbtodb = 0; + } + if (fs->lfs_resvseg == 0) + fs->lfs_resvseg = MIN(fs->lfs_minfreeseg - 1, \ + MAX(MIN_RESV_SEGS, fs->lfs_minfreeseg / 2 + 1)); + + /* + * If we aren't going to be able to write meaningfully to this + * filesystem, and were not mounted readonly, bomb out now. + */ + if (fsbtob(fs, LFS_NRESERVE(fs)) > LFS_MAX_BYTES && !ronly) { + DLOG((DLOG_MOUNT, "lfs_mount: to mount this filesystem read/write," + " we need BUFPAGES >= %lld\n", + (long long)((bufmem_hiwater / bufmem_lowater) * + LFS_INVERSE_MAX_BYTES( + fsbtob(fs, LFS_NRESERVE(fs))) >> PAGE_SHIFT))); + free(fs, M_UFSMNT); + error = EFBIG; /* XXX needs translation */ + goto out; + } + + /* Before rolling forward, lock so vget will sleep for other procs */ + if (l != NULL) { + fs->lfs_flags = LFS_NOTYET; + fs->lfs_rfpid = l->l_proc->p_pid; + } + + ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO); + ump->um_lfs = fs; + ump->um_ops = &lfs_ufsops; + ump->um_fstype = UFS1; + if (sizeof(struct lfs) < LFS_SBPAD) { /* XXX why? */ + brelse(bp, BC_INVAL); + brelse(abp, BC_INVAL); + } else { + brelse(bp, 0); + brelse(abp, 0); + } + bp = NULL; + abp = NULL; + + + /* Set up the I/O information */ + fs->lfs_devbsize = DEV_BSIZE; + fs->lfs_iocount = 0; + fs->lfs_diropwait = 0; + fs->lfs_activesb = 0; + fs->lfs_uinodes = 0; + fs->lfs_ravail = 0; + fs->lfs_favail = 0; + fs->lfs_sbactive = 0; + + /* Set up the ifile and lock aflags */ + fs->lfs_doifile = 0; + fs->lfs_writer = 0; + fs->lfs_dirops = 0; + fs->lfs_nadirop = 0; + fs->lfs_seglock = 0; + fs->lfs_pdflush = 0; + fs->lfs_sleepers = 0; + fs->lfs_pages = 0; + rw_init(&fs->lfs_fraglock); + rw_init(&fs->lfs_iflock); + cv_init(&fs->lfs_stopcv, "lfsstop"); + + /* Set the file system readonly/modify bits. */ + fs->lfs_ronly = ronly; + if (ronly == 0) + fs->lfs_fmod = 1; + + /* Initialize the mount structure. 
*/ + dev = devvp->v_rdev; + mp->mnt_data = ump; + mp->mnt_stat.f_fsidx.__fsid_val[0] = (long)dev; + mp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_LFS); + mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; + mp->mnt_stat.f_namemax = LFS_MAXNAMLEN; + mp->mnt_stat.f_iosize = fs->lfs_bsize; + mp->mnt_flag |= MNT_LOCAL; + mp->mnt_fs_bshift = fs->lfs_bshift; + ump->um_flags = 0; + ump->um_mountp = mp; + ump->um_dev = dev; + ump->um_devvp = devvp; + ump->um_bptrtodb = fs->lfs_ffshift - DEV_BSHIFT; + ump->um_seqinc = fs->lfs_frag; + ump->um_nindir = fs->lfs_nindir; + ump->um_lognindir = ffs(fs->lfs_nindir) - 1; + for (i = 0; i < MAXQUOTAS; i++) + ump->um_quotas[i] = NULLVP; + ump->um_maxsymlinklen = fs->lfs_maxsymlinklen; + ump->um_dirblksiz = DIRBLKSIZ; + ump->um_maxfilesize = fs->lfs_maxfilesize; + if (ump->um_maxsymlinklen > 0) + mp->mnt_iflag |= IMNT_DTYPE; + devvp->v_specmountpoint = mp; + + /* Set up reserved memory for pageout */ + lfs_setup_resblks(fs); + /* Set up vdirop tailq */ + TAILQ_INIT(&fs->lfs_dchainhd); + /* and paging tailq */ + TAILQ_INIT(&fs->lfs_pchainhd); + /* and delayed segment accounting for truncation list */ + LIST_INIT(&fs->lfs_segdhd); + + /* + * We use the ifile vnode for almost every operation. Instead of + * retrieving it from the hash table each time we retrieve it here, + * artificially increment the reference count and keep a pointer + * to it in the incore copy of the superblock. + */ + if ((error = VFS_VGET(mp, LFS_IFILE_INUM, &vp)) != 0) { + DLOG((DLOG_MOUNT, "lfs_mountfs: ifile vget failed, error=%d\n", error)); + goto out; + } + fs->lfs_ivnode = vp; + vref(vp); + + /* Set up inode bitmap and order free list */ + lfs_order_freelist(fs); + + /* Set up segment usage flags for the autocleaner. */ + fs->lfs_nactive = 0; + fs->lfs_suflags = (u_int32_t **)malloc(2 * sizeof(u_int32_t *), + M_SEGMENT, M_WAITOK); + fs->lfs_suflags[0] = (u_int32_t *)malloc(fs->lfs_nseg * sizeof(u_int32_t), + M_SEGMENT, M_WAITOK); + fs->lfs_suflags[1] = (u_int32_t *)malloc(fs->lfs_nseg * sizeof(u_int32_t), + M_SEGMENT, M_WAITOK); + memset(fs->lfs_suflags[1], 0, fs->lfs_nseg * sizeof(u_int32_t)); + for (i = 0; i < fs->lfs_nseg; i++) { + int changed; + + LFS_SEGENTRY(sup, fs, i, bp); + changed = 0; + if (!ronly) { + if (sup->su_nbytes == 0 && + !(sup->su_flags & SEGUSE_EMPTY)) { + sup->su_flags |= SEGUSE_EMPTY; + ++changed; + } else if (!(sup->su_nbytes == 0) && + (sup->su_flags & SEGUSE_EMPTY)) { + sup->su_flags &= ~SEGUSE_EMPTY; + ++changed; + } + if (sup->su_flags & (SEGUSE_ACTIVE|SEGUSE_INVAL)) { + sup->su_flags &= ~(SEGUSE_ACTIVE|SEGUSE_INVAL); + ++changed; + } + } + fs->lfs_suflags[0][i] = sup->su_flags; + if (changed) + LFS_WRITESEGENTRY(sup, fs, i, bp); + else + brelse(bp, 0); + } + +#ifdef LFS_KERNEL_RFW + lfs_roll_forward(fs, mp, l); +#endif + + /* If writing, sb is not clean; record in case of immediate crash */ + if (!fs->lfs_ronly) { + fs->lfs_pflags &= ~LFS_PF_CLEAN; + lfs_writesuper(fs, fs->lfs_sboffs[0]); + lfs_writesuper(fs, fs->lfs_sboffs[1]); + } + + /* Allow vget now that roll-forward is complete */ + fs->lfs_flags &= ~(LFS_NOTYET); + wakeup(&fs->lfs_flags); + + /* + * Initialize the ifile cleaner info with information from + * the superblock. + */ + LFS_CLEANERINFO(cip, fs, bp); + cip->clean = fs->lfs_nclean; + cip->dirty = fs->lfs_nseg - fs->lfs_nclean; + cip->avail = fs->lfs_avail; + cip->bfree = fs->lfs_bfree; + (void) LFS_BWRITE_LOG(bp); /* Ifile */ + + /* + * Mark the current segment as ACTIVE, since we're going to + * be writing to it. 
+ */ + LFS_SEGENTRY(sup, fs, dtosn(fs, fs->lfs_offset), bp); + sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE; + fs->lfs_nactive++; + LFS_WRITESEGENTRY(sup, fs, dtosn(fs, fs->lfs_offset), bp); /* Ifile */ + + /* Now that roll-forward is done, unlock the Ifile */ + vput(vp); + + /* Start the pagedaemon-anticipating daemon */ + if (lfs_writer_daemon == 0 && kthread_create(PRI_BIO, 0, NULL, + lfs_writerd, NULL, NULL, "lfs_writer") != 0) + panic("fork lfs_writer"); + /* + * XXX: Get extra reference to LFS vfsops. This prevents unload, + * but also prevents kernel panic due to text being unloaded + * from below lfs_writerd. When lfs_writerd can exit, remove + * this!!! + */ + vfs_getopsbyname(MOUNT_LFS); + + printf("WARNING: the log-structured file system is experimental\n" + "WARNING: it may cause system crashes and/or corrupt data\n"); + + return (0); + +out: + if (bp) + brelse(bp, 0); + if (abp) + brelse(abp, 0); + if (ump) { + free(ump->um_lfs, M_UFSMNT); + free(ump, M_UFSMNT); + mp->mnt_data = NULL; + } + + return (error); +} + +/* + * unmount system call + */ +int +lfs_unmount(struct mount *mp, int mntflags) +{ + struct lwp *l = curlwp; + struct ufsmount *ump; + struct lfs *fs; + int error, flags, ronly; + vnode_t *vp; + + flags = 0; + if (mntflags & MNT_FORCE) + flags |= FORCECLOSE; + + ump = VFSTOUFS(mp); + fs = ump->um_lfs; + + /* Two checkpoints */ + lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC); + lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC); + + /* wake up the cleaner so it can die */ + lfs_wakeup_cleaner(fs); + mutex_enter(&lfs_lock); + while (fs->lfs_sleepers) + mtsleep(&fs->lfs_sleepers, PRIBIO + 1, "lfs_sleepers", 0, + &lfs_lock); + mutex_exit(&lfs_lock); + +#ifdef QUOTA + if ((error = quota1_umount(mp, flags)) != 0) + return (error); +#endif + if ((error = vflush(mp, fs->lfs_ivnode, flags)) != 0) + return (error); + if ((error = VFS_SYNC(mp, 1, l->l_cred)) != 0) + return (error); + vp = fs->lfs_ivnode; + mutex_enter(vp->v_interlock); + if (LIST_FIRST(&vp->v_dirtyblkhd)) + panic("lfs_unmount: still dirty blocks on ifile vnode"); + mutex_exit(vp->v_interlock); + + /* Explicitly write the superblock, to update serial and pflags */ + fs->lfs_pflags |= LFS_PF_CLEAN; + lfs_writesuper(fs, fs->lfs_sboffs[0]); + lfs_writesuper(fs, fs->lfs_sboffs[1]); + mutex_enter(&lfs_lock); + while (fs->lfs_iocount) + mtsleep(&fs->lfs_iocount, PRIBIO + 1, "lfs_umount", 0, + &lfs_lock); + mutex_exit(&lfs_lock); + + /* Finish with the Ifile, now that we're done with it */ + vgone(fs->lfs_ivnode); + + ronly = !fs->lfs_ronly; + if (ump->um_devvp->v_type != VBAD) + ump->um_devvp->v_specmountpoint = NULL; + vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_CLOSE(ump->um_devvp, + ronly ? FREAD : FREAD|FWRITE, NOCRED); + vput(ump->um_devvp); + + /* Complain about page leakage */ + if (fs->lfs_pages > 0) + printf("lfs_unmount: still claim %d pages (%d in subsystem)\n", + fs->lfs_pages, lfs_subsys_pages); + + /* Free per-mount data structures */ + free(fs->lfs_ino_bitmap, M_SEGMENT); + free(fs->lfs_suflags[0], M_SEGMENT); + free(fs->lfs_suflags[1], M_SEGMENT); + free(fs->lfs_suflags, M_SEGMENT); + lfs_free_resblks(fs); + cv_destroy(&fs->lfs_stopcv); + rw_destroy(&fs->lfs_fraglock); + rw_destroy(&fs->lfs_iflock); + free(fs, M_UFSMNT); + free(ump, M_UFSMNT); + + mp->mnt_data = NULL; + mp->mnt_flag &= ~MNT_LOCAL; + return (error); +} + +/* + * Get file system statistics. + * + * NB: We don't lock to access the superblock here, because it's not + * really that important if we get it wrong. 
+ */ +int +lfs_statvfs(struct mount *mp, struct statvfs *sbp) +{ + struct lfs *fs; + struct ufsmount *ump; + + ump = VFSTOUFS(mp); + fs = ump->um_lfs; + if (fs->lfs_magic != LFS_MAGIC) + panic("lfs_statvfs: magic"); + + sbp->f_bsize = fs->lfs_bsize; + sbp->f_frsize = fs->lfs_fsize; + sbp->f_iosize = fs->lfs_bsize; + sbp->f_blocks = LFS_EST_NONMETA(fs) - VTOI(fs->lfs_ivnode)->i_lfs_effnblks; + + sbp->f_bfree = LFS_EST_BFREE(fs); + KASSERT(sbp->f_bfree <= fs->lfs_dsize); +#if 0 + if (sbp->f_bfree < 0) + sbp->f_bfree = 0; +#endif + + sbp->f_bresvd = LFS_EST_RSVD(fs); + if (sbp->f_bfree > sbp->f_bresvd) + sbp->f_bavail = sbp->f_bfree - sbp->f_bresvd; + else + sbp->f_bavail = 0; + + sbp->f_files = fs->lfs_bfree / btofsb(fs, fs->lfs_ibsize) * INOPB(fs); + sbp->f_ffree = sbp->f_files - fs->lfs_nfiles; + sbp->f_favail = sbp->f_ffree; + sbp->f_fresvd = 0; + copy_statvfs_info(sbp, mp); + return (0); +} + +/* + * Go through the disk queues to initiate sandbagged IO; + * go through the inodes to write those that have been modified; + * initiate the writing of the super block if it has been modified. + * + * Note: we are always called with the filesystem marked `MPBUSY'. + */ +int +lfs_sync(struct mount *mp, int waitfor, kauth_cred_t cred) +{ + int error; + struct lfs *fs; + + fs = VFSTOUFS(mp)->um_lfs; + if (fs->lfs_ronly) + return 0; + + /* Snapshots should not hose the syncer */ + /* + * XXX Sync can block here anyway, since we don't have a very + * XXX good idea of how much data is pending. If it's more + * XXX than a segment and lfs_nextseg is close to the end of + * XXX the log, we'll likely block. + */ + mutex_enter(&lfs_lock); + if (fs->lfs_nowrap && fs->lfs_nextseg < fs->lfs_curseg) { + mutex_exit(&lfs_lock); + return 0; + } + mutex_exit(&lfs_lock); + + lfs_writer_enter(fs, "lfs_dirops"); + + /* All syncs must be checkpoints until roll-forward is implemented. */ + DLOG((DLOG_FLUSH, "lfs_sync at 0x%x\n", fs->lfs_offset)); + error = lfs_segwrite(mp, SEGM_CKP | (waitfor ? SEGM_SYNC : 0)); + lfs_writer_leave(fs); +#ifdef QUOTA + qsync(mp); +#endif + return (error); +} + +/* + * Look up an LFS dinode number to find its incore vnode. If not already + * in core, read it in from the specified device. Return the inode locked. + * Detection and handling of mount points must be done by the calling routine. + */ +int +lfs_vget(struct mount *mp, ino_t ino, struct vnode **vpp) +{ + struct lfs *fs; + struct ufs1_dinode *dip; + struct inode *ip; + struct buf *bp; + struct ifile *ifp; + struct vnode *vp; + struct ufsmount *ump; + daddr_t daddr; + dev_t dev; + int error, retries; + struct timespec ts; + + memset(&ts, 0, sizeof ts); /* XXX gcc */ + + ump = VFSTOUFS(mp); + dev = ump->um_dev; + fs = ump->um_lfs; + + /* + * If the filesystem is not completely mounted yet, suspend + * any access requests (wait for roll-forward to complete). + */ + mutex_enter(&lfs_lock); + while ((fs->lfs_flags & LFS_NOTYET) && curproc->p_pid != fs->lfs_rfpid) + mtsleep(&fs->lfs_flags, PRIBIO+1, "lfs_notyet", 0, + &lfs_lock); + mutex_exit(&lfs_lock); + +retry: + if ((*vpp = ufs_ihashget(dev, ino, LK_EXCLUSIVE)) != NULL) + return (0); + + error = getnewvnode(VT_LFS, mp, lfs_vnodeop_p, NULL, &vp); + if (error) { + *vpp = NULL; + return (error); + } + + mutex_enter(&ufs_hashlock); + if (ufs_ihashget(dev, ino, 0) != NULL) { + mutex_exit(&ufs_hashlock); + ungetnewvnode(vp); + goto retry; + } + + /* Translate the inode number to a disk address. 
*/ + if (ino == LFS_IFILE_INUM) + daddr = fs->lfs_idaddr; + else { + /* XXX bounds-check this too */ + LFS_IENTRY(ifp, fs, ino, bp); + daddr = ifp->if_daddr; + if (fs->lfs_version > 1) { + ts.tv_sec = ifp->if_atime_sec; + ts.tv_nsec = ifp->if_atime_nsec; + } + + brelse(bp, 0); + if (daddr == LFS_UNUSED_DADDR) { + *vpp = NULLVP; + mutex_exit(&ufs_hashlock); + ungetnewvnode(vp); + return (ENOENT); + } + } + + /* Allocate/init new vnode/inode. */ + lfs_vcreate(mp, ino, vp); + + /* + * Put it onto its hash chain and lock it so that other requests for + * this inode will block if they arrive while we are sleeping waiting + * for old data structures to be purged or for the contents of the + * disk portion of this inode to be read. + */ + ip = VTOI(vp); + ufs_ihashins(ip); + mutex_exit(&ufs_hashlock); + + /* + * XXX + * This may not need to be here, logically it should go down with + * the i_devvp initialization. + * Ask Kirk. + */ + ip->i_lfs = ump->um_lfs; + + /* Read in the disk contents for the inode, copy into the inode. */ + retries = 0; + again: + error = bread(ump->um_devvp, fsbtodb(fs, daddr), + (fs->lfs_version == 1 ? fs->lfs_bsize : fs->lfs_ibsize), + NOCRED, 0, &bp); + if (error) { + /* + * The inode does not contain anything useful, so it would + * be misleading to leave it on its hash chain. With mode + * still zero, it will be unlinked and returned to the free + * list by vput(). + */ + vput(vp); + brelse(bp, 0); + *vpp = NULL; + return (error); + } + + dip = lfs_ifind(fs, ino, bp); + if (dip == NULL) { + /* Assume write has not completed yet; try again */ + brelse(bp, BC_INVAL); + ++retries; + if (retries > LFS_IFIND_RETRIES) { +#ifdef DEBUG + /* If the seglock is held look at the bpp to see + what is there anyway */ + mutex_enter(&lfs_lock); + if (fs->lfs_seglock > 0) { + struct buf **bpp; + struct ufs1_dinode *dp; + int i; + + for (bpp = fs->lfs_sp->bpp; + bpp != fs->lfs_sp->cbpp; ++bpp) { + if ((*bpp)->b_vp == fs->lfs_ivnode && + bpp != fs->lfs_sp->bpp) { + /* Inode block */ + printf("lfs_vget: block 0x%" PRIx64 ": ", + (*bpp)->b_blkno); + dp = (struct ufs1_dinode *)(*bpp)->b_data; + for (i = 0; i < INOPB(fs); i++) + if (dp[i].di_u.inumber) + printf("%d ", dp[i].di_u.inumber); + printf("\n"); + } + } + } + mutex_exit(&lfs_lock); +#endif /* DEBUG */ + panic("lfs_vget: dinode not found"); + } + mutex_enter(&lfs_lock); + if (fs->lfs_iocount) { + DLOG((DLOG_VNODE, "lfs_vget: dinode %d not found, retrying...\n", ino)); + (void)mtsleep(&fs->lfs_iocount, PRIBIO + 1, + "lfs ifind", 1, &lfs_lock); + } else + retries = LFS_IFIND_RETRIES; + mutex_exit(&lfs_lock); + goto again; + } + *ip->i_din.ffs1_din = *dip; + brelse(bp, 0); + + if (fs->lfs_version > 1) { + ip->i_ffs1_atime = ts.tv_sec; + ip->i_ffs1_atimensec = ts.tv_nsec; + } + + lfs_vinit(mp, &vp); + + *vpp = vp; + + KASSERT(VOP_ISLOCKED(vp)); + + return (0); +} + +/* + * File handle to vnode + */ +int +lfs_fhtovp(struct mount *mp, struct fid *fhp, struct vnode **vpp) +{ + struct lfid lfh; + struct buf *bp; + IFILE *ifp; + int32_t daddr; + struct lfs *fs; + vnode_t *vp; + + if (fhp->fid_len != sizeof(struct lfid)) + return EINVAL; + + memcpy(&lfh, fhp, sizeof(lfh)); + if (lfh.lfid_ino < LFS_IFILE_INUM) + return ESTALE; + + fs = VFSTOUFS(mp)->um_lfs; + if (lfh.lfid_ident != fs->lfs_ident) + return ESTALE; + + if (lfh.lfid_ino > + ((VTOI(fs->lfs_ivnode)->i_ffs1_size >> fs->lfs_bshift) - + fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb) + return ESTALE; + + mutex_enter(&ufs_ihash_lock); + vp = ufs_ihashlookup(VFSTOUFS(mp)->um_dev, 
lfh.lfid_ino); + mutex_exit(&ufs_ihash_lock); + if (vp == NULL) { + LFS_IENTRY(ifp, fs, lfh.lfid_ino, bp); + daddr = ifp->if_daddr; + brelse(bp, 0); + if (daddr == LFS_UNUSED_DADDR) + return ESTALE; + } + + return (ufs_fhtovp(mp, &lfh.lfid_ufid, vpp)); +} + +/* + * Vnode pointer to File handle + */ +/* ARGSUSED */ +int +lfs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size) +{ + struct inode *ip; + struct lfid lfh; + + if (*fh_size < sizeof(struct lfid)) { + *fh_size = sizeof(struct lfid); + return E2BIG; + } + *fh_size = sizeof(struct lfid); + ip = VTOI(vp); + memset(&lfh, 0, sizeof(lfh)); + lfh.lfid_len = sizeof(struct lfid); + lfh.lfid_ino = ip->i_number; + lfh.lfid_gen = ip->i_gen; + lfh.lfid_ident = ip->i_lfs->lfs_ident; + memcpy(fhp, &lfh, sizeof(lfh)); + return (0); +} + +/* + * ufs_bmaparray callback function for writing. + * + * Since blocks will be written to the new segment anyway, + * we don't care about current daddr of them. + */ +static bool +lfs_issequential_hole(const struct ufsmount *ump, + daddr_t daddr0, daddr_t daddr1) +{ + daddr0 = (daddr_t)((int32_t)daddr0); /* XXX ondisk32 */ + daddr1 = (daddr_t)((int32_t)daddr1); /* XXX ondisk32 */ + + KASSERT(daddr0 == UNWRITTEN || + (0 <= daddr0 && daddr0 <= LFS_MAX_DADDR)); + KASSERT(daddr1 == UNWRITTEN || + (0 <= daddr1 && daddr1 <= LFS_MAX_DADDR)); + + /* NOTE: all we want to know here is 'hole or not'. */ + /* NOTE: UNASSIGNED is converted to 0 by ufs_bmaparray. */ + + /* + * treat UNWRITTENs and all resident blocks as 'contiguous' + */ + if (daddr0 != 0 && daddr1 != 0) + return true; + + /* + * both are in hole? + */ + if (daddr0 == 0 && daddr1 == 0) + return true; /* all holes are 'contiguous' for us. */ + + return false; +} + +/* + * lfs_gop_write functions exactly like genfs_gop_write, except that + * (1) it requires the seglock to be held by its caller, and sp->fip + * to be properly initialized (it will return without re-initializing + * sp->fip, and without calling lfs_writeseg). + * (2) it uses the remaining space in the segment, rather than VOP_BMAP, + * to determine how large a block it can write at once (though it does + * still use VOP_BMAP to find holes in the file); + * (3) it calls lfs_gatherblock instead of VOP_STRATEGY on its blocks + * (leaving lfs_writeseg to deal with the cluster blocks, so we might + * now have clusters of clusters, ick.) + */ +static int +lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, + int flags) +{ + int i, error, run, haveeof = 0; + int fs_bshift; + vaddr_t kva; + off_t eof, offset, startoffset = 0; + size_t bytes, iobytes, skipbytes; + bool async = (flags & PGO_SYNCIO) == 0; + daddr_t lbn, blkno; + struct vm_page *pg; + struct buf *mbp, *bp; + struct vnode *devvp = VTOI(vp)->i_devvp; + struct inode *ip = VTOI(vp); + struct lfs *fs = ip->i_lfs; + struct segment *sp = fs->lfs_sp; + UVMHIST_FUNC("lfs_gop_write"); UVMHIST_CALLED(ubchist); + + ASSERT_SEGLOCK(fs); + + /* The Ifile lives in the buffer cache */ + KASSERT(vp != fs->lfs_ivnode); + + /* + * We don't want to fill the disk before the cleaner has a chance + * to make room for us. If we're in danger of doing that, fail + * with EAGAIN. The caller will have to notice this, unlock + * so the cleaner can run, relock and try again. + * + * We must write everything, however, if our vnode is being + * reclaimed. 
+ */ + if (LFS_STARVED_FOR_SEGS(fs) && vp != fs->lfs_flushvp) + goto tryagain; + + /* + * Sometimes things slip past the filters in lfs_putpages, + * and the pagedaemon tries to write pages---problem is + * that the pagedaemon never acquires the segment lock. + * + * Alternatively, pages that were clean when we called + * genfs_putpages may have become dirty in the meantime. In this + * case the segment header is not properly set up for blocks + * to be added to it. + * + * Unbusy and unclean the pages, and put them on the ACTIVE + * queue under the hypothesis that they couldn't have got here + * unless they were modified *quite* recently. + * + * XXXUBC that last statement is an oversimplification of course. + */ + if (!LFS_SEGLOCK_HELD(fs) || + (ip->i_lfs_iflags & LFSI_NO_GOP_WRITE) || + (pgs[0]->offset & fs->lfs_bmask) != 0) { + goto tryagain; + } + + UVMHIST_LOG(ubchist, "vp %p pgs %p npages %d flags 0x%x", + vp, pgs, npages, flags); + + GOP_SIZE(vp, vp->v_size, &eof, 0); + haveeof = 1; + + if (vp->v_type == VREG) + fs_bshift = vp->v_mount->mnt_fs_bshift; + else + fs_bshift = DEV_BSHIFT; + error = 0; + pg = pgs[0]; + startoffset = pg->offset; + KASSERT(eof >= 0); + + if (startoffset >= eof) { + goto tryagain; + } else + bytes = MIN(npages << PAGE_SHIFT, eof - startoffset); + skipbytes = 0; + + KASSERT(bytes != 0); + + /* Swap PG_DELWRI for PG_PAGEOUT */ + for (i = 0; i < npages; i++) { + if (pgs[i]->flags & PG_DELWRI) { + KASSERT(!(pgs[i]->flags & PG_PAGEOUT)); + pgs[i]->flags &= ~PG_DELWRI; + pgs[i]->flags |= PG_PAGEOUT; + uvm_pageout_start(1); + mutex_enter(&uvm_pageqlock); + uvm_pageunwire(pgs[i]); + mutex_exit(&uvm_pageqlock); + } + } + + /* + * Check to make sure we're starting on a block boundary. + * We'll check later to make sure we always write entire + * blocks (or fragments). + */ + if (startoffset & fs->lfs_bmask) + printf("%" PRId64 " & %" PRId64 " = %" PRId64 "\n", + startoffset, fs->lfs_bmask, + startoffset & fs->lfs_bmask); + KASSERT((startoffset & fs->lfs_bmask) == 0); + if (bytes & fs->lfs_ffmask) { + printf("lfs_gop_write: asked to write %ld bytes\n", (long)bytes); + panic("lfs_gop_write: non-integer blocks"); + } + + /* + * We could deadlock here on pager_map with UVMPAGER_MAPIN_WAITOK. + * If we would, write what we have and try again. If we don't + * have anything to write, we'll have to sleep. + */ + if ((kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WRITE | + (((SEGSUM *)(sp->segsum))->ss_nfinfo < 1 ? + UVMPAGER_MAPIN_WAITOK : 0))) == 0x0) { + DLOG((DLOG_PAGE, "lfs_gop_write: forcing write\n")); +#if 0 + " with nfinfo=%d at offset 0x%x\n", + (int)((SEGSUM *)(sp->segsum))->ss_nfinfo, + (unsigned)fs->lfs_offset)); +#endif + lfs_updatemeta(sp); + lfs_release_finfo(fs); + (void) lfs_writeseg(fs, sp); + + lfs_acquire_finfo(fs, ip->i_number, ip->i_gen); + + /* + * Having given up all of the pager_map we were holding, + * we can now wait for aiodoned to reclaim it for us + * without fear of deadlock. 
+ */ + kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WRITE | + UVMPAGER_MAPIN_WAITOK); + } + + mbp = getiobuf(NULL, true); + UVMHIST_LOG(ubchist, "vp %p mbp %p num now %d bytes 0x%x", + vp, mbp, vp->v_numoutput, bytes); + mbp->b_bufsize = npages << PAGE_SHIFT; + mbp->b_data = (void *)kva; + mbp->b_resid = mbp->b_bcount = bytes; + mbp->b_cflags = BC_BUSY|BC_AGE; + mbp->b_iodone = uvm_aio_biodone; + + bp = NULL; + for (offset = startoffset; + bytes > 0; + offset += iobytes, bytes -= iobytes) { + lbn = offset >> fs_bshift; + error = ufs_bmaparray(vp, lbn, &blkno, NULL, NULL, &run, + lfs_issequential_hole); + if (error) { + UVMHIST_LOG(ubchist, "ufs_bmaparray() -> %d", + error,0,0,0); + skipbytes += bytes; + bytes = 0; + break; + } + + iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset, + bytes); + if (blkno == (daddr_t)-1) { + skipbytes += iobytes; + continue; + } + + /* + * Discover how much we can really pack into this buffer. + */ + /* If no room in the current segment, finish it up */ + if (sp->sum_bytes_left < sizeof(int32_t) || + sp->seg_bytes_left < (1 << fs->lfs_bshift)) { + int vers; + + lfs_updatemeta(sp); + vers = sp->fip->fi_version; + lfs_release_finfo(fs); + (void) lfs_writeseg(fs, sp); + + lfs_acquire_finfo(fs, ip->i_number, vers); + } + /* Check both for space in segment and space in segsum */ + iobytes = MIN(iobytes, (sp->seg_bytes_left >> fs_bshift) + << fs_bshift); + iobytes = MIN(iobytes, (sp->sum_bytes_left / sizeof(int32_t)) + << fs_bshift); + KASSERT(iobytes > 0); + + /* if it's really one i/o, don't make a second buf */ + if (offset == startoffset && iobytes == bytes) { + bp = mbp; + /* + * All the LFS output is done by the segwriter. It + * will increment numoutput by one for all the bufs it + * recieves. However this buffer needs one extra to + * account for aiodone. + */ + mutex_enter(vp->v_interlock); + vp->v_numoutput++; + mutex_exit(vp->v_interlock); + } else { + bp = getiobuf(NULL, true); + UVMHIST_LOG(ubchist, "vp %p bp %p num now %d", + vp, bp, vp->v_numoutput, 0); + nestiobuf_setup(mbp, bp, offset - pg->offset, iobytes); + /* + * LFS doesn't like async I/O here, dies with + * and assert in lfs_bwrite(). Is that assert + * valid? I retained non-async behaviour when + * converted this to use nestiobuf --pooka + */ + bp->b_flags &= ~B_ASYNC; + } + + /* XXX This is silly ... is this necessary? */ + mutex_enter(&bufcache_lock); + mutex_enter(vp->v_interlock); + bgetvp(vp, bp); + mutex_exit(vp->v_interlock); + mutex_exit(&bufcache_lock); + + bp->b_lblkno = lblkno(fs, offset); + bp->b_private = mbp; + if (devvp->v_type == VBLK) { + bp->b_dev = devvp->v_rdev; + } + VOP_BWRITE(bp->b_vp, bp); + while (lfs_gatherblock(sp, bp, NULL)) + continue; + } + + nestiobuf_done(mbp, skipbytes, error); + if (skipbytes) { + UVMHIST_LOG(ubchist, "skipbytes %d", skipbytes, 0,0,0); + } + UVMHIST_LOG(ubchist, "returning 0", 0,0,0,0); + + if (!async) { + /* Start a segment write. */ + UVMHIST_LOG(ubchist, "flushing", 0,0,0,0); + mutex_enter(&lfs_lock); + lfs_flush(fs, 0, 1); + mutex_exit(&lfs_lock); + } + return (0); + + tryagain: + /* + * We can't write the pages, for whatever reason. + * Clean up after ourselves, and make the caller try again. 
+ */ + mutex_enter(vp->v_interlock); + + /* Tell why we're here, if we know */ + if (ip->i_lfs_iflags & LFSI_NO_GOP_WRITE) { + DLOG((DLOG_PAGE, "lfs_gop_write: clean pages dirtied\n")); + } else if ((pgs[0]->offset & fs->lfs_bmask) != 0) { + DLOG((DLOG_PAGE, "lfs_gop_write: not on block boundary\n")); + } else if (haveeof && startoffset >= eof) { + DLOG((DLOG_PAGE, "lfs_gop_write: ino %d start 0x%" PRIx64 + " eof 0x%" PRIx64 " npages=%d\n", VTOI(vp)->i_number, + pgs[0]->offset, eof, npages)); + } else if (LFS_STARVED_FOR_SEGS(fs)) { + DLOG((DLOG_PAGE, "lfs_gop_write: avail too low\n")); + } else { + DLOG((DLOG_PAGE, "lfs_gop_write: seglock not held\n")); + } + + mutex_enter(&uvm_pageqlock); + for (i = 0; i < npages; i++) { + pg = pgs[i]; + + if (pg->flags & PG_PAGEOUT) + uvm_pageout_done(1); + if (pg->flags & PG_DELWRI) { + uvm_pageunwire(pg); + } + uvm_pageactivate(pg); + pg->flags &= ~(PG_CLEAN|PG_DELWRI|PG_PAGEOUT|PG_RELEASED); + DLOG((DLOG_PAGE, "pg[%d] = %p (vp %p off %" PRIx64 ")\n", i, pg, + vp, pg->offset)); + DLOG((DLOG_PAGE, "pg[%d]->flags = %x\n", i, pg->flags)); + DLOG((DLOG_PAGE, "pg[%d]->pqflags = %x\n", i, pg->pqflags)); + DLOG((DLOG_PAGE, "pg[%d]->uanon = %p\n", i, pg->uanon)); + DLOG((DLOG_PAGE, "pg[%d]->uobject = %p\n", i, pg->uobject)); + DLOG((DLOG_PAGE, "pg[%d]->wire_count = %d\n", i, + pg->wire_count)); + DLOG((DLOG_PAGE, "pg[%d]->loan_count = %d\n", i, + pg->loan_count)); + } + /* uvm_pageunbusy takes care of PG_BUSY, PG_WANTED */ + uvm_page_unbusy(pgs, npages); + mutex_exit(&uvm_pageqlock); + mutex_exit(vp->v_interlock); + return EAGAIN; +} + +/* + * finish vnode/inode initialization. + * used by lfs_vget and lfs_fastvget. + */ +void +lfs_vinit(struct mount *mp, struct vnode **vpp) +{ + struct vnode *vp = *vpp; + struct inode *ip = VTOI(vp); + struct ufsmount *ump = VFSTOUFS(mp); + struct lfs *fs = ump->um_lfs; + int i; + + ip->i_mode = ip->i_ffs1_mode; + ip->i_nlink = ip->i_ffs1_nlink; + ip->i_lfs_osize = ip->i_size = ip->i_ffs1_size; + ip->i_flags = ip->i_ffs1_flags; + ip->i_gen = ip->i_ffs1_gen; + ip->i_uid = ip->i_ffs1_uid; + ip->i_gid = ip->i_ffs1_gid; + + ip->i_lfs_effnblks = ip->i_ffs1_blocks; + ip->i_lfs_odnlink = ip->i_ffs1_nlink; + + /* + * Initialize the vnode from the inode, check for aliases. In all + * cases re-init ip, the underlying vnode/inode may have changed. + */ + ufs_vinit(mp, lfs_specop_p, lfs_fifoop_p, &vp); + ip = VTOI(vp); + + memset(ip->i_lfs_fragsize, 0, NDADDR * sizeof(*ip->i_lfs_fragsize)); + if (vp->v_type != VLNK || ip->i_size >= ip->i_ump->um_maxsymlinklen) { +#ifdef DEBUG + for (i = (ip->i_size + fs->lfs_bsize - 1) >> fs->lfs_bshift; + i < NDADDR; i++) { + if ((vp->v_type == VBLK || vp->v_type == VCHR) && + i == 0) + continue; + if (ip->i_ffs1_db[i] != 0) { +inconsistent: + lfs_dump_dinode(ip->i_din.ffs1_din); + panic("inconsistent inode"); + } + } + for ( ; i < NDADDR + NIADDR; i++) { + if (ip->i_ffs1_ib[i - NDADDR] != 0) { + goto inconsistent; + } + } +#endif /* DEBUG */ + for (i = 0; i < NDADDR; i++) + if (ip->i_ffs1_db[i] != 0) + ip->i_lfs_fragsize[i] = blksize(fs, ip, i); + } + +#ifdef DIAGNOSTIC + if (vp->v_type == VNON) { +# ifdef DEBUG + lfs_dump_dinode(ip->i_din.ffs1_din); +# endif + panic("lfs_vinit: ino %llu is type VNON! (ifmt=%o)\n", + (unsigned long long)ip->i_number, + (ip->i_mode & IFMT) >> 12); + } +#endif /* DIAGNOSTIC */ + + /* + * Finish inode initialization now that aliasing has been resolved. 
+ */ + + ip->i_devvp = ump->um_devvp; + vref(ip->i_devvp); + genfs_node_init(vp, &lfs_genfsops); + uvm_vnp_setsize(vp, ip->i_size); + + /* Initialize hiblk from file size */ + ip->i_lfs_hiblk = lblkno(ip->i_lfs, ip->i_size + ip->i_lfs->lfs_bsize - 1) - 1; + + *vpp = vp; +} + +/* + * Resize the filesystem to contain the specified number of segments. + */ +int +lfs_resize_fs(struct lfs *fs, int newnsegs) +{ + SEGUSE *sup; + struct buf *bp, *obp; + daddr_t olast, nlast, ilast, noff, start, end; + struct vnode *ivp; + struct inode *ip; + int error, badnews, inc, oldnsegs; + int sbbytes, csbbytes, gain, cgain; + int i; + + /* Only support v2 and up */ + if (fs->lfs_version < 2) + return EOPNOTSUPP; + + /* If we're doing nothing, do it fast */ + oldnsegs = fs->lfs_nseg; + if (newnsegs == oldnsegs) + return 0; + + /* We always have to have two superblocks */ + if (newnsegs <= dtosn(fs, fs->lfs_sboffs[1])) + return EFBIG; + + ivp = fs->lfs_ivnode; + ip = VTOI(ivp); + error = 0; + + /* Take the segment lock so no one else calls lfs_newseg() */ + lfs_seglock(fs, SEGM_PROT); + + /* + * Make sure the segments we're going to be losing, if any, + * are in fact empty. We hold the seglock, so their status + * cannot change underneath us. Count the superblocks we lose, + * while we're at it. + */ + sbbytes = csbbytes = 0; + cgain = 0; + for (i = newnsegs; i < oldnsegs; i++) { + LFS_SEGENTRY(sup, fs, i, bp); + badnews = sup->su_nbytes || !(sup->su_flags & SEGUSE_INVAL); + if (sup->su_flags & SEGUSE_SUPERBLOCK) + sbbytes += LFS_SBPAD; + if (!(sup->su_flags & SEGUSE_DIRTY)) { + ++cgain; + if (sup->su_flags & SEGUSE_SUPERBLOCK) + csbbytes += LFS_SBPAD; + } + brelse(bp, 0); + if (badnews) { + error = EBUSY; + goto out; + } + } + + /* Note old and new segment table endpoints, and old ifile size */ + olast = fs->lfs_cleansz + fs->lfs_segtabsz; + nlast = howmany(newnsegs, fs->lfs_sepb) + fs->lfs_cleansz; + ilast = ivp->v_size >> fs->lfs_bshift; + noff = nlast - olast; + + /* + * Make sure no one can use the Ifile while we change it around. + * Even after taking the iflock we need to make sure no one still + * is holding Ifile buffers, so we get each one, to drain them. + * (XXX this could be done better.) 
+ */ + rw_enter(&fs->lfs_iflock, RW_WRITER); + vn_lock(ivp, LK_EXCLUSIVE | LK_RETRY); + for (i = 0; i < ilast; i++) { + bread(ivp, i, fs->lfs_bsize, NOCRED, 0, &bp); + brelse(bp, 0); + } + + /* Allocate new Ifile blocks */ + for (i = ilast; i < ilast + noff; i++) { + if (lfs_balloc(ivp, i * fs->lfs_bsize, fs->lfs_bsize, NOCRED, 0, + &bp) != 0) + panic("balloc extending ifile"); + memset(bp->b_data, 0, fs->lfs_bsize); + VOP_BWRITE(bp->b_vp, bp); + } + + /* Register new ifile size */ + ip->i_size += noff * fs->lfs_bsize; + ip->i_ffs1_size = ip->i_size; + uvm_vnp_setsize(ivp, ip->i_size); + + /* Copy the inode table to its new position */ + if (noff != 0) { + if (noff < 0) { + start = nlast; + end = ilast + noff; + inc = 1; + } else { + start = ilast + noff - 1; + end = nlast - 1; + inc = -1; + } + for (i = start; i != end; i += inc) { + if (bread(ivp, i, fs->lfs_bsize, NOCRED, + B_MODIFY, &bp) != 0) + panic("resize: bread dst blk failed"); + if (bread(ivp, i - noff, fs->lfs_bsize, + NOCRED, 0, &obp)) + panic("resize: bread src blk failed"); + memcpy(bp->b_data, obp->b_data, fs->lfs_bsize); + VOP_BWRITE(bp->b_vp, bp); + brelse(obp, 0); + } + } + + /* If we are expanding, write the new empty SEGUSE entries */ + if (newnsegs > oldnsegs) { + for (i = oldnsegs; i < newnsegs; i++) { + if ((error = bread(ivp, i / fs->lfs_sepb + + fs->lfs_cleansz, fs->lfs_bsize, + NOCRED, B_MODIFY, &bp)) != 0) + panic("lfs: ifile read: %d", error); + while ((i + 1) % fs->lfs_sepb && i < newnsegs) { + sup = &((SEGUSE *)bp->b_data)[i % fs->lfs_sepb]; + memset(sup, 0, sizeof(*sup)); + i++; + } + VOP_BWRITE(bp->b_vp, bp); + } + } + + /* Zero out unused superblock offsets */ + for (i = 2; i < LFS_MAXNUMSB; i++) + if (dtosn(fs, fs->lfs_sboffs[i]) >= newnsegs) + fs->lfs_sboffs[i] = 0x0; + + /* + * Correct superblock entries that depend on fs size. + * The computations of these are as follows: + * + * size = segtod(fs, nseg) + * dsize = segtod(fs, nseg - minfreeseg) - btofsb(#super * LFS_SBPAD) + * bfree = dsize - btofsb(fs, bsize * nseg / 2) - blocks_actually_used + * avail = segtod(fs, nclean) - btofsb(#clean_super * LFS_SBPAD) + * + (segtod(fs, 1) - (offset - curseg)) + * - segtod(fs, minfreeseg - (minfreeseg / 2)) + * + * XXX - we should probably adjust minfreeseg as well. 
+ */ + gain = (newnsegs - oldnsegs); + fs->lfs_nseg = newnsegs; + fs->lfs_segtabsz = nlast - fs->lfs_cleansz; + fs->lfs_size += gain * btofsb(fs, fs->lfs_ssize); + fs->lfs_dsize += gain * btofsb(fs, fs->lfs_ssize) - btofsb(fs, sbbytes); + fs->lfs_bfree += gain * btofsb(fs, fs->lfs_ssize) - btofsb(fs, sbbytes) + - gain * btofsb(fs, fs->lfs_bsize / 2); + if (gain > 0) { + fs->lfs_nclean += gain; + fs->lfs_avail += gain * btofsb(fs, fs->lfs_ssize); + } else { + fs->lfs_nclean -= cgain; + fs->lfs_avail -= cgain * btofsb(fs, fs->lfs_ssize) - + btofsb(fs, csbbytes); + } + + /* Resize segment flag cache */ + fs->lfs_suflags[0] = (u_int32_t *)realloc(fs->lfs_suflags[0], + fs->lfs_nseg * sizeof(u_int32_t), + M_SEGMENT, M_WAITOK); + fs->lfs_suflags[1] = (u_int32_t *)realloc(fs->lfs_suflags[1], + fs->lfs_nseg * sizeof(u_int32_t), + M_SEGMENT, M_WAITOK); + for (i = oldnsegs; i < newnsegs; i++) + fs->lfs_suflags[0][i] = fs->lfs_suflags[1][i] = 0x0; + + /* Truncate Ifile if necessary */ + if (noff < 0) + lfs_truncate(ivp, ivp->v_size + (noff << fs->lfs_bshift), 0, + NOCRED); + + /* Update cleaner info so the cleaner can die */ + bread(ivp, 0, fs->lfs_bsize, NOCRED, B_MODIFY, &bp); + ((CLEANERINFO *)bp->b_data)->clean = fs->lfs_nclean; + ((CLEANERINFO *)bp->b_data)->dirty = fs->lfs_nseg - fs->lfs_nclean; + VOP_BWRITE(bp->b_vp, bp); + + /* Let Ifile accesses proceed */ + VOP_UNLOCK(ivp); + rw_exit(&fs->lfs_iflock); + + out: + lfs_segunlock(fs); + return error; +} diff --git a/sys/ufs/lfs/lfs_vnops.c b/sys/ufs/lfs/lfs_vnops.c new file mode 100644 index 000000000..f30a5d20c --- /dev/null +++ b/sys/ufs/lfs/lfs_vnops.c @@ -0,0 +1,2478 @@ +/* $NetBSD: lfs_vnops.c,v 1.238 2011/09/20 14:01:33 chs Exp $ */ + +/*- + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Konrad E. Schroder . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/* + * Copyright (c) 1986, 1989, 1991, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_vnops.c 8.13 (Berkeley) 6/10/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: lfs_vnops.c,v 1.238 2011/09/20 14:01:33 chs Exp $"); + +#ifdef _KERNEL_OPT +#include "opt_compat_netbsd.h" +#include "opt_uvm_page_trkown.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +extern pid_t lfs_writer_daemon; +int lfs_ignore_lazy_sync = 1; + +/* Global vfs data structures for lfs. 
*/ +int (**lfs_vnodeop_p)(void *); +const struct vnodeopv_entry_desc lfs_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, ufs_lookup }, /* lookup */ + { &vop_create_desc, lfs_create }, /* create */ + { &vop_whiteout_desc, ufs_whiteout }, /* whiteout */ + { &vop_mknod_desc, lfs_mknod }, /* mknod */ + { &vop_open_desc, ufs_open }, /* open */ + { &vop_close_desc, lfs_close }, /* close */ + { &vop_access_desc, ufs_access }, /* access */ + { &vop_getattr_desc, lfs_getattr }, /* getattr */ + { &vop_setattr_desc, lfs_setattr }, /* setattr */ + { &vop_read_desc, lfs_read }, /* read */ + { &vop_write_desc, lfs_write }, /* write */ + { &vop_ioctl_desc, ufs_ioctl }, /* ioctl */ + { &vop_fcntl_desc, lfs_fcntl }, /* fcntl */ + { &vop_poll_desc, ufs_poll }, /* poll */ + { &vop_kqfilter_desc, genfs_kqfilter }, /* kqfilter */ + { &vop_revoke_desc, ufs_revoke }, /* revoke */ + { &vop_mmap_desc, lfs_mmap }, /* mmap */ + { &vop_fsync_desc, lfs_fsync }, /* fsync */ + { &vop_seek_desc, ufs_seek }, /* seek */ + { &vop_remove_desc, lfs_remove }, /* remove */ + { &vop_link_desc, lfs_link }, /* link */ + { &vop_rename_desc, lfs_rename }, /* rename */ + { &vop_mkdir_desc, lfs_mkdir }, /* mkdir */ + { &vop_rmdir_desc, lfs_rmdir }, /* rmdir */ + { &vop_symlink_desc, lfs_symlink }, /* symlink */ + { &vop_readdir_desc, ufs_readdir }, /* readdir */ + { &vop_readlink_desc, ufs_readlink }, /* readlink */ + { &vop_abortop_desc, ufs_abortop }, /* abortop */ + { &vop_inactive_desc, lfs_inactive }, /* inactive */ + { &vop_reclaim_desc, lfs_reclaim }, /* reclaim */ + { &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, ufs_bmap }, /* bmap */ + { &vop_strategy_desc, lfs_strategy }, /* strategy */ + { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, ufs_pathconf }, /* pathconf */ + { &vop_advlock_desc, ufs_advlock }, /* advlock */ + { &vop_bwrite_desc, lfs_bwrite }, /* bwrite */ + { &vop_getpages_desc, lfs_getpages }, /* getpages */ + { &vop_putpages_desc, lfs_putpages }, /* putpages */ + { NULL, NULL } +}; +const struct vnodeopv_desc lfs_vnodeop_opv_desc = + { &lfs_vnodeop_p, lfs_vnodeop_entries }; + +int (**lfs_specop_p)(void *); +const struct vnodeopv_entry_desc lfs_specop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, spec_lookup }, /* lookup */ + { &vop_create_desc, spec_create }, /* create */ + { &vop_mknod_desc, spec_mknod }, /* mknod */ + { &vop_open_desc, spec_open }, /* open */ + { &vop_close_desc, lfsspec_close }, /* close */ + { &vop_access_desc, ufs_access }, /* access */ + { &vop_getattr_desc, lfs_getattr }, /* getattr */ + { &vop_setattr_desc, lfs_setattr }, /* setattr */ + { &vop_read_desc, ufsspec_read }, /* read */ + { &vop_write_desc, ufsspec_write }, /* write */ + { &vop_ioctl_desc, spec_ioctl }, /* ioctl */ + { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */ + { &vop_poll_desc, spec_poll }, /* poll */ + { &vop_kqfilter_desc, spec_kqfilter }, /* kqfilter */ + { &vop_revoke_desc, spec_revoke }, /* revoke */ + { &vop_mmap_desc, spec_mmap }, /* mmap */ + { &vop_fsync_desc, spec_fsync }, /* fsync */ + { &vop_seek_desc, spec_seek }, /* seek */ + { &vop_remove_desc, spec_remove }, /* remove */ + { &vop_link_desc, spec_link }, /* link */ + { &vop_rename_desc, spec_rename }, /* rename */ + { &vop_mkdir_desc, spec_mkdir }, /* mkdir */ + { &vop_rmdir_desc, spec_rmdir }, /* rmdir */ + { &vop_symlink_desc, spec_symlink }, /* 
symlink */ + { &vop_readdir_desc, spec_readdir }, /* readdir */ + { &vop_readlink_desc, spec_readlink }, /* readlink */ + { &vop_abortop_desc, spec_abortop }, /* abortop */ + { &vop_inactive_desc, lfs_inactive }, /* inactive */ + { &vop_reclaim_desc, lfs_reclaim }, /* reclaim */ + { &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, spec_bmap }, /* bmap */ + { &vop_strategy_desc, spec_strategy }, /* strategy */ + { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, spec_pathconf }, /* pathconf */ + { &vop_advlock_desc, spec_advlock }, /* advlock */ + { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ + { &vop_getpages_desc, spec_getpages }, /* getpages */ + { &vop_putpages_desc, spec_putpages }, /* putpages */ + { NULL, NULL } +}; +const struct vnodeopv_desc lfs_specop_opv_desc = + { &lfs_specop_p, lfs_specop_entries }; + +int (**lfs_fifoop_p)(void *); +const struct vnodeopv_entry_desc lfs_fifoop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, vn_fifo_bypass }, /* lookup */ + { &vop_create_desc, vn_fifo_bypass }, /* create */ + { &vop_mknod_desc, vn_fifo_bypass }, /* mknod */ + { &vop_open_desc, vn_fifo_bypass }, /* open */ + { &vop_close_desc, lfsfifo_close }, /* close */ + { &vop_access_desc, ufs_access }, /* access */ + { &vop_getattr_desc, lfs_getattr }, /* getattr */ + { &vop_setattr_desc, lfs_setattr }, /* setattr */ + { &vop_read_desc, ufsfifo_read }, /* read */ + { &vop_write_desc, ufsfifo_write }, /* write */ + { &vop_ioctl_desc, vn_fifo_bypass }, /* ioctl */ + { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */ + { &vop_poll_desc, vn_fifo_bypass }, /* poll */ + { &vop_kqfilter_desc, vn_fifo_bypass }, /* kqfilter */ + { &vop_revoke_desc, vn_fifo_bypass }, /* revoke */ + { &vop_mmap_desc, vn_fifo_bypass }, /* mmap */ + { &vop_fsync_desc, vn_fifo_bypass }, /* fsync */ + { &vop_seek_desc, vn_fifo_bypass }, /* seek */ + { &vop_remove_desc, vn_fifo_bypass }, /* remove */ + { &vop_link_desc, vn_fifo_bypass }, /* link */ + { &vop_rename_desc, vn_fifo_bypass }, /* rename */ + { &vop_mkdir_desc, vn_fifo_bypass }, /* mkdir */ + { &vop_rmdir_desc, vn_fifo_bypass }, /* rmdir */ + { &vop_symlink_desc, vn_fifo_bypass }, /* symlink */ + { &vop_readdir_desc, vn_fifo_bypass }, /* readdir */ + { &vop_readlink_desc, vn_fifo_bypass }, /* readlink */ + { &vop_abortop_desc, vn_fifo_bypass }, /* abortop */ + { &vop_inactive_desc, lfs_inactive }, /* inactive */ + { &vop_reclaim_desc, lfs_reclaim }, /* reclaim */ + { &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, vn_fifo_bypass }, /* bmap */ + { &vop_strategy_desc, vn_fifo_bypass }, /* strategy */ + { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, vn_fifo_bypass }, /* pathconf */ + { &vop_advlock_desc, vn_fifo_bypass }, /* advlock */ + { &vop_bwrite_desc, lfs_bwrite }, /* bwrite */ + { &vop_putpages_desc, vn_fifo_bypass }, /* putpages */ + { NULL, NULL } +}; +const struct vnodeopv_desc lfs_fifoop_opv_desc = + { &lfs_fifoop_p, lfs_fifoop_entries }; + +static int check_dirty(struct lfs *, struct vnode *, off_t, off_t, off_t, int, int, struct vm_page **); + +#define LFS_READWRITE +#include +#undef LFS_READWRITE + +/* + * Synch an open file. 
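+ * + * Dirty pages are pushed through VOP_PUTPAGES(); unless FSYNC_DATAONLY is + * set, the inode itself is then written with lfs_update(). FSYNC_LAZY + * requests are queued for the LFS writer daemon instead of being written + * synchronously here.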
+ */ +/* ARGSUSED */ +int +lfs_fsync(void *v) +{ + struct vop_fsync_args /* { + struct vnode *a_vp; + kauth_cred_t a_cred; + int a_flags; + off_t offlo; + off_t offhi; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + int error, wait; + struct inode *ip = VTOI(vp); + struct lfs *fs = ip->i_lfs; + + /* If we're mounted read-only, don't try to sync. */ + if (fs->lfs_ronly) + return 0; + + /* If a removed vnode is being cleaned, no need to sync here. */ + if ((ap->a_flags & FSYNC_RECLAIM) != 0 && ip->i_mode == 0) + return 0; + + /* + * Trickle sync simply adds this vnode to the pager list, as if + * the pagedaemon had requested a pageout. + */ + if (ap->a_flags & FSYNC_LAZY) { + if (lfs_ignore_lazy_sync == 0) { + mutex_enter(&lfs_lock); + if (!(ip->i_flags & IN_PAGING)) { + ip->i_flags |= IN_PAGING; + TAILQ_INSERT_TAIL(&fs->lfs_pchainhd, ip, + i_lfs_pchain); + } + wakeup(&lfs_writer_daemon); + mutex_exit(&lfs_lock); + } + return 0; + } + + /* + * If a vnode is being cleaned, flush it out before we try to + * reuse it. This prevents the cleaner from writing files twice + * in the same partial segment, causing an accounting underflow. + */ + if (ap->a_flags & FSYNC_RECLAIM && ip->i_flags & IN_CLEANING) { + lfs_vflush(vp); + } + + wait = (ap->a_flags & FSYNC_WAIT); + do { + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, trunc_page(ap->a_offlo), + round_page(ap->a_offhi), + PGO_CLEANIT | (wait ? PGO_SYNCIO : 0)); + if (error == EAGAIN) { + mutex_enter(&lfs_lock); + mtsleep(&fs->lfs_avail, PCATCH | PUSER, "lfs_fsync", + hz / 100 + 1, &lfs_lock); + mutex_exit(&lfs_lock); + } + } while (error == EAGAIN); + if (error) + return error; + + if ((ap->a_flags & FSYNC_DATAONLY) == 0) + error = lfs_update(vp, NULL, NULL, wait ? UPDATE_WAIT : 0); + + if (error == 0 && ap->a_flags & FSYNC_CACHE) { + int l = 0; + error = VOP_IOCTL(ip->i_devvp, DIOCCACHESYNC, &l, FWRITE, + curlwp->l_cred); + } + if (wait && !VPISEMPTY(vp)) + LFS_SET_UINO(ip, IN_MODIFIED); + + return error; +} + +/* + * Take IN_ADIROP off, then call ufs_inactive. + */ +int +lfs_inactive(void *v) +{ + struct vop_inactive_args /* { + struct vnode *a_vp; + } */ *ap = v; + + lfs_unmark_vnode(ap->a_vp); + + /* + * The Ifile is only ever inactivated on unmount. + * Streamline this process by not giving it more dirty blocks. + */ + if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM) { + mutex_enter(&lfs_lock); + LFS_CLR_UINO(VTOI(ap->a_vp), IN_ALLMOD); + mutex_exit(&lfs_lock); + VOP_UNLOCK(ap->a_vp); + return 0; + } + + return ufs_inactive(v); +} + +/* + * These macros are used to bracket UFS directory ops, so that we can + * identify all the pages touched during directory ops which need to + * be ordered and flushed atomically, so that they may be recovered. + * + * Because we have to mark nodes VU_DIROP in order to prevent + * the cache from reclaiming them while a dirop is in progress, we must + * also manage the number of nodes so marked (otherwise we can run out). + * We do this by setting lfs_dirvcount to the number of marked vnodes; it + * is decremented during segment write, when VU_DIROP is taken off.
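+ * + * lfs_set_dirop() below enforces that limit: when lfs_dirvcount exceeds + * LFS_MAX_DIROP it wakes the writer daemon and sleeps until enough marked + * vnodes have been written back and unmarked.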
+ */ +#define MARK_VNODE(vp) lfs_mark_vnode(vp) +#define UNMARK_VNODE(vp) lfs_unmark_vnode(vp) +#define SET_DIROP_CREATE(dvp, vpp) lfs_set_dirop_create((dvp), (vpp)) +#define SET_DIROP_REMOVE(dvp, vp) lfs_set_dirop((dvp), (vp)) +static int lfs_set_dirop_create(struct vnode *, struct vnode **); +static int lfs_set_dirop(struct vnode *, struct vnode *); + +static int +lfs_set_dirop(struct vnode *dvp, struct vnode *vp) +{ + struct lfs *fs; + int error; + + KASSERT(VOP_ISLOCKED(dvp)); + KASSERT(vp == NULL || VOP_ISLOCKED(vp)); + + fs = VTOI(dvp)->i_lfs; + + ASSERT_NO_SEGLOCK(fs); + /* + * LFS_NRESERVE calculates direct and indirect blocks as well + * as an inode block; an overestimate in most cases. + */ + if ((error = lfs_reserve(fs, dvp, vp, LFS_NRESERVE(fs))) != 0) + return (error); + + restart: + mutex_enter(&lfs_lock); + if (fs->lfs_dirops == 0) { + mutex_exit(&lfs_lock); + lfs_check(dvp, LFS_UNUSED_LBN, 0); + mutex_enter(&lfs_lock); + } + while (fs->lfs_writer) { + error = mtsleep(&fs->lfs_dirops, (PRIBIO + 1) | PCATCH, + "lfs_sdirop", 0, &lfs_lock); + if (error == EINTR) { + mutex_exit(&lfs_lock); + goto unreserve; + } + } + if (lfs_dirvcount > LFS_MAX_DIROP && fs->lfs_dirops == 0) { + wakeup(&lfs_writer_daemon); + mutex_exit(&lfs_lock); + preempt(); + goto restart; + } + + if (lfs_dirvcount > LFS_MAX_DIROP) { + mutex_exit(&lfs_lock); + DLOG((DLOG_DIROP, "lfs_set_dirop: sleeping with dirops=%d, " + "dirvcount=%d\n", fs->lfs_dirops, lfs_dirvcount)); + if ((error = mtsleep(&lfs_dirvcount, + PCATCH | PUSER | PNORELOCK, "lfs_maxdirop", 0, + &lfs_lock)) != 0) { + goto unreserve; + } + goto restart; + } + + ++fs->lfs_dirops; + fs->lfs_doifile = 1; + mutex_exit(&lfs_lock); + + /* Hold a reference so SET_ENDOP will be happy */ + vref(dvp); + if (vp) { + vref(vp); + MARK_VNODE(vp); + } + + MARK_VNODE(dvp); + return 0; + + unreserve: + lfs_reserve(fs, dvp, vp, -LFS_NRESERVE(fs)); + return error; +} + +/* + * Get a new vnode *before* adjusting the dirop count, to avoid a deadlock + * in getnewvnode(), if we have a stacked filesystem mounted on top + * of us. + * + * NB: this means we have to clear the new vnodes on error. Fortunately + * SET_ENDOP is there to do that for us. 
+ */ +static int +lfs_set_dirop_create(struct vnode *dvp, struct vnode **vpp) +{ + int error; + struct lfs *fs; + + fs = VFSTOUFS(dvp->v_mount)->um_lfs; + ASSERT_NO_SEGLOCK(fs); + if (fs->lfs_ronly) + return EROFS; + if (vpp == NULL) { + return lfs_set_dirop(dvp, NULL); + } + error = getnewvnode(VT_LFS, dvp->v_mount, lfs_vnodeop_p, NULL, vpp); + if (error) { + DLOG((DLOG_ALLOC, "lfs_set_dirop_create: dvp %p error %d\n", + dvp, error)); + return error; + } + if ((error = lfs_set_dirop(dvp, NULL)) != 0) { + ungetnewvnode(*vpp); + *vpp = NULL; + return error; + } + return 0; +} + +#define SET_ENDOP_BASE(fs, dvp, str) \ + do { \ + mutex_enter(&lfs_lock); \ + --(fs)->lfs_dirops; \ + if (!(fs)->lfs_dirops) { \ + if ((fs)->lfs_nadirop) { \ + panic("SET_ENDOP: %s: no dirops but " \ + " nadirop=%d", (str), \ + (fs)->lfs_nadirop); \ + } \ + wakeup(&(fs)->lfs_writer); \ + mutex_exit(&lfs_lock); \ + lfs_check((dvp), LFS_UNUSED_LBN, 0); \ + } else \ + mutex_exit(&lfs_lock); \ + } while(0) +#define SET_ENDOP_CREATE(fs, dvp, nvpp, str) \ + do { \ + UNMARK_VNODE(dvp); \ + if (nvpp && *nvpp) \ + UNMARK_VNODE(*nvpp); \ + /* Check for error return to stem vnode leakage */ \ + if (nvpp && *nvpp && !((*nvpp)->v_uflag & VU_DIROP)) \ + ungetnewvnode(*(nvpp)); \ + SET_ENDOP_BASE((fs), (dvp), (str)); \ + lfs_reserve((fs), (dvp), NULL, -LFS_NRESERVE(fs)); \ + vrele(dvp); \ + } while(0) +#define SET_ENDOP_CREATE_AP(ap, str) \ + SET_ENDOP_CREATE(VTOI((ap)->a_dvp)->i_lfs, (ap)->a_dvp, \ + (ap)->a_vpp, (str)) +#define SET_ENDOP_REMOVE(fs, dvp, ovp, str) \ + do { \ + UNMARK_VNODE(dvp); \ + if (ovp) \ + UNMARK_VNODE(ovp); \ + SET_ENDOP_BASE((fs), (dvp), (str)); \ + lfs_reserve((fs), (dvp), (ovp), -LFS_NRESERVE(fs)); \ + vrele(dvp); \ + if (ovp) \ + vrele(ovp); \ + } while(0) + +void +lfs_mark_vnode(struct vnode *vp) +{ + struct inode *ip = VTOI(vp); + struct lfs *fs = ip->i_lfs; + + mutex_enter(&lfs_lock); + if (!(ip->i_flag & IN_ADIROP)) { + if (!(vp->v_uflag & VU_DIROP)) { + mutex_enter(vp->v_interlock); + (void)lfs_vref(vp); + ++lfs_dirvcount; + ++fs->lfs_dirvcount; + TAILQ_INSERT_TAIL(&fs->lfs_dchainhd, ip, i_lfs_dchain); + vp->v_uflag |= VU_DIROP; + } + ++fs->lfs_nadirop; + ip->i_flag |= IN_ADIROP; + } else + KASSERT(vp->v_uflag & VU_DIROP); + mutex_exit(&lfs_lock); +} + +void +lfs_unmark_vnode(struct vnode *vp) +{ + struct inode *ip = VTOI(vp); + + if (ip && (ip->i_flag & IN_ADIROP)) { + KASSERT(vp->v_uflag & VU_DIROP); + mutex_enter(&lfs_lock); + --ip->i_lfs->lfs_nadirop; + mutex_exit(&lfs_lock); + ip->i_flag &= ~IN_ADIROP; + } +} + +int +lfs_symlink(void *v) +{ + struct vop_symlink_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + char *a_target; + } */ *ap = v; + int error; + + if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) { + vput(ap->a_dvp); + return error; + } + error = ufs_symlink(ap); + SET_ENDOP_CREATE_AP(ap, "symlink"); + return (error); +} + +int +lfs_mknod(void *v) +{ + struct vop_mknod_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap = v; + struct vattr *vap = ap->a_vap; + struct vnode **vpp = ap->a_vpp; + struct inode *ip; + int error; + struct mount *mp; + ino_t ino; + struct ufs_lookup_results *ulr; + + /* XXX should handle this material another way */ + ulr = &VTOI(ap->a_dvp)->i_crap; + UFS_CHECK_CRAPCOUNTER(VTOI(ap->a_dvp)); + + if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) { + vput(ap->a_dvp); + return error; + } + error = 
ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), + ap->a_dvp, ulr, vpp, ap->a_cnp); + + /* Either way we're done with the dirop at this point */ + SET_ENDOP_CREATE_AP(ap, "mknod"); + + if (error) + return (error); + + ip = VTOI(*vpp); + mp = (*vpp)->v_mount; + ino = ip->i_number; + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + if (vap->va_rdev != VNOVAL) { + /* + * Want to be able to use this to make badblock + * inodes, so don't truncate the dev number. + */ +#if 0 + ip->i_ffs1_rdev = ufs_rw32(vap->va_rdev, + UFS_MPNEEDSWAP((*vpp)->v_mount)); +#else + ip->i_ffs1_rdev = vap->va_rdev; +#endif + } + + /* + * Call fsync to write the vnode so that we don't have to deal with + * flushing it when it's marked VU_DIROP|VI_XLOCK. + * + * XXX KS - If we can't flush we also can't call vgone(), so must + * return. But, that leaves this vnode in limbo, also not good. + * Can this ever happen (barring hardware failure)? + */ + if ((error = VOP_FSYNC(*vpp, NOCRED, FSYNC_WAIT, 0, 0)) != 0) { + panic("lfs_mknod: couldn't fsync (ino %llu)", + (unsigned long long)ino); + /* return (error); */ + } + /* + * Remove vnode so that it will be reloaded by VFS_VGET and + * checked to see if it is an alias of an existing entry in + * the inode cache. + */ + /* Used to be vput, but that causes us to call VOP_INACTIVE twice. */ + + VOP_UNLOCK(*vpp); + (*vpp)->v_type = VNON; + vgone(*vpp); + error = VFS_VGET(mp, ino, vpp); + + if (error != 0) { + *vpp = NULL; + return (error); + } + return (0); +} + +int +lfs_create(void *v) +{ + struct vop_create_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap = v; + int error; + + if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) { + vput(ap->a_dvp); + return error; + } + error = ufs_create(ap); + SET_ENDOP_CREATE_AP(ap, "create"); + return (error); +} + +int +lfs_mkdir(void *v) +{ + struct vop_mkdir_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap = v; + int error; + + if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) { + vput(ap->a_dvp); + return error; + } + error = ufs_mkdir(ap); + SET_ENDOP_CREATE_AP(ap, "mkdir"); + return (error); +} + +int +lfs_remove(void *v) +{ + struct vop_remove_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap = v; + struct vnode *dvp, *vp; + struct inode *ip; + int error; + + dvp = ap->a_dvp; + vp = ap->a_vp; + ip = VTOI(vp); + if ((error = SET_DIROP_REMOVE(dvp, vp)) != 0) { + if (dvp == vp) + vrele(vp); + else + vput(vp); + vput(dvp); + return error; + } + error = ufs_remove(ap); + if (ip->i_nlink == 0) + lfs_orphan(ip->i_lfs, ip->i_number); + SET_ENDOP_REMOVE(ip->i_lfs, dvp, ap->a_vp, "remove"); + return (error); +} + +int +lfs_rmdir(void *v) +{ + struct vop_rmdir_args /* { + struct vnodeop_desc *a_desc; + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap = v; + struct vnode *vp; + struct inode *ip; + int error; + + vp = ap->a_vp; + ip = VTOI(vp); + if ((error = SET_DIROP_REMOVE(ap->a_dvp, ap->a_vp)) != 0) { + if (ap->a_dvp == vp) + vrele(ap->a_dvp); + else + vput(ap->a_dvp); + vput(vp); + return error; + } + error = ufs_rmdir(ap); + if (ip->i_nlink == 0) + lfs_orphan(ip->i_lfs, ip->i_number); + SET_ENDOP_REMOVE(ip->i_lfs, ap->a_dvp, ap->a_vp, "rmdir"); + return (error); +} + +int +lfs_link(void *v) +{ + struct vop_link_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname 
*a_cnp; + } */ *ap = v; + int error; + struct vnode **vpp = NULL; + + if ((error = SET_DIROP_CREATE(ap->a_dvp, vpp)) != 0) { + vput(ap->a_dvp); + return error; + } + error = ufs_link(ap); + SET_ENDOP_CREATE(VTOI(ap->a_dvp)->i_lfs, ap->a_dvp, vpp, "link"); + return (error); +} + +int +lfs_rename(void *v) +{ + struct vop_rename_args /* { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; + } */ *ap = v; + struct vnode *tvp, *fvp, *tdvp, *fdvp; + struct componentname *tcnp, *fcnp; + int error; + struct lfs *fs; + + fs = VTOI(ap->a_fdvp)->i_lfs; + tvp = ap->a_tvp; + tdvp = ap->a_tdvp; + tcnp = ap->a_tcnp; + fvp = ap->a_fvp; + fdvp = ap->a_fdvp; + fcnp = ap->a_fcnp; + + /* + * Check for cross-device rename. + * If it is, we don't want to set dirops, just error out. + * (In particular note that MARK_VNODE(tdvp) will DTWT on + * a cross-device rename.) + * + * Copied from ufs_rename. + */ + if ((fvp->v_mount != tdvp->v_mount) || + (tvp && (fvp->v_mount != tvp->v_mount))) { + error = EXDEV; + goto errout; + } + + /* + * Check to make sure we're not renaming a vnode onto itself + * (deleting a hard link by renaming one name onto another); + * if we are we can't recursively call VOP_REMOVE since that + * would leave us with an unaccounted-for number of live dirops. + * + * Inline the relevant section of ufs_rename here, *before* + * calling SET_DIROP_REMOVE. + */ + if (tvp && ((VTOI(tvp)->i_flags & (IMMUTABLE | APPEND)) || + (VTOI(tdvp)->i_flags & APPEND))) { + error = EPERM; + goto errout; + } + if (fvp == tvp) { + if (fvp->v_type == VDIR) { + error = EINVAL; + goto errout; + } + + /* Release destination completely. */ + VOP_ABORTOP(tdvp, tcnp); + vput(tdvp); + vput(tvp); + + /* Delete source. */ + vrele(fvp); + fcnp->cn_flags &= ~(MODMASK); + fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; + fcnp->cn_nameiop = DELETE; + vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY); + if ((error = relookup(fdvp, &fvp, fcnp, 0))) { + vput(fdvp); + return (error); + } + return (VOP_REMOVE(fdvp, fvp, fcnp)); + } + + if ((error = SET_DIROP_REMOVE(tdvp, tvp)) != 0) + goto errout; + MARK_VNODE(fdvp); + MARK_VNODE(fvp); + + error = ufs_rename(ap); + UNMARK_VNODE(fdvp); + UNMARK_VNODE(fvp); + SET_ENDOP_REMOVE(fs, tdvp, tvp, "rename"); + return (error); + + errout: + VOP_ABORTOP(tdvp, ap->a_tcnp); /* XXX, why not in NFS? */ + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp) + vput(tvp); + VOP_ABORTOP(fdvp, ap->a_fcnp); /* XXX, why not in NFS? 
*/ + vrele(fdvp); + vrele(fvp); + return (error); +} + +/* XXX hack to avoid calling ITIMES in getattr */ +int +lfs_getattr(void *v) +{ + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct vattr *vap = ap->a_vap; + struct lfs *fs = ip->i_lfs; + /* + * Copy from inode table + */ + vap->va_fsid = ip->i_dev; + vap->va_fileid = ip->i_number; + vap->va_mode = ip->i_mode & ~IFMT; + vap->va_nlink = ip->i_nlink; + vap->va_uid = ip->i_uid; + vap->va_gid = ip->i_gid; + vap->va_rdev = (dev_t)ip->i_ffs1_rdev; + vap->va_size = vp->v_size; + vap->va_atime.tv_sec = ip->i_ffs1_atime; + vap->va_atime.tv_nsec = ip->i_ffs1_atimensec; + vap->va_mtime.tv_sec = ip->i_ffs1_mtime; + vap->va_mtime.tv_nsec = ip->i_ffs1_mtimensec; + vap->va_ctime.tv_sec = ip->i_ffs1_ctime; + vap->va_ctime.tv_nsec = ip->i_ffs1_ctimensec; + vap->va_flags = ip->i_flags; + vap->va_gen = ip->i_gen; + /* this doesn't belong here */ + if (vp->v_type == VBLK) + vap->va_blocksize = BLKDEV_IOSIZE; + else if (vp->v_type == VCHR) + vap->va_blocksize = MAXBSIZE; + else + vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; + vap->va_bytes = fsbtob(fs, (u_quad_t)ip->i_lfs_effnblks); + vap->va_type = vp->v_type; + vap->va_filerev = ip->i_modrev; + return (0); +} + +/* + * Check to make sure the inode blocks won't choke the buffer + * cache, then call ufs_setattr as usual. + */ +int +lfs_setattr(void *v) +{ + struct vop_setattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + + lfs_check(vp, LFS_UNUSED_LBN, 0); + return ufs_setattr(v); +} + +/* + * Release the block we hold on lfs_newseg wrapping. Called on file close, + * or explicitly from LFCNWRAPGO. Called with the interlock held. + */ +static int +lfs_wrapgo(struct lfs *fs, struct inode *ip, int waitfor) +{ + if (fs->lfs_stoplwp != curlwp) + return EBUSY; + + fs->lfs_stoplwp = NULL; + cv_signal(&fs->lfs_stopcv); + + KASSERT(fs->lfs_nowrap > 0); + if (fs->lfs_nowrap <= 0) { + return 0; + } + + if (--fs->lfs_nowrap == 0) { + log(LOG_NOTICE, "%s: re-enabled log wrap\n", fs->lfs_fsmnt); + wakeup(&fs->lfs_wrappass); + lfs_wakeup_cleaner(fs); + } + if (waitfor) { + mtsleep(&fs->lfs_nextseg, PCATCH | PUSER, "segment", + 0, &lfs_lock); + } + + return 0; +} + +/* + * Close called + */ +/* ARGSUSED */ +int +lfs_close(void *v) +{ + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct lfs *fs = ip->i_lfs; + + if ((ip->i_number == ROOTINO || ip->i_number == LFS_IFILE_INUM) && + fs->lfs_stoplwp == curlwp) { + mutex_enter(&lfs_lock); + log(LOG_NOTICE, "lfs_close: releasing log wrap control\n"); + lfs_wrapgo(fs, ip, 0); + mutex_exit(&lfs_lock); + } + + if (vp == ip->i_lfs->lfs_ivnode && + vp->v_mount->mnt_iflag & IMNT_UNMOUNT) + return 0; + + if (vp->v_usecount > 1 && vp != ip->i_lfs->lfs_ivnode) { + LFS_ITIMES(ip, NULL, NULL, NULL); + } + return (0); +} + +/* + * Close wrapper for special devices. + * + * Update the times on the inode then do device close. 
+ */ +int +lfsspec_close(void *v) +{ + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp; + struct inode *ip; + + vp = ap->a_vp; + ip = VTOI(vp); + if (vp->v_usecount > 1) { + LFS_ITIMES(ip, NULL, NULL, NULL); + } + return (VOCALL (spec_vnodeop_p, VOFFSET(vop_close), ap)); +} + +/* + * Close wrapper for fifo's. + * + * Update the times on the inode then do device close. + */ +int +lfsfifo_close(void *v) +{ + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp; + struct inode *ip; + + vp = ap->a_vp; + ip = VTOI(vp); + if (ap->a_vp->v_usecount > 1) { + LFS_ITIMES(ip, NULL, NULL, NULL); + } + return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_close), ap)); +} + +/* + * Reclaim an inode so that it can be used for other purposes. + */ + +int +lfs_reclaim(void *v) +{ + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct lfs *fs = ip->i_lfs; + int error; + + /* + * The inode must be freed and updated before being removed + * from its hash chain. Other threads trying to gain a hold + * on the inode will be stalled because it is locked (VI_XLOCK). + */ + if (ip->i_nlink <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) + lfs_vfree(vp, ip->i_number, ip->i_omode); + + mutex_enter(&lfs_lock); + LFS_CLR_UINO(ip, IN_ALLMOD); + mutex_exit(&lfs_lock); + if ((error = ufs_reclaim(vp))) + return (error); + + /* + * Take us off the paging and/or dirop queues if we were on them. + * We shouldn't be on them. + */ + mutex_enter(&lfs_lock); + if (ip->i_flags & IN_PAGING) { + log(LOG_WARNING, "%s: reclaimed vnode is IN_PAGING\n", + fs->lfs_fsmnt); + ip->i_flags &= ~IN_PAGING; + TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain); + } + if (vp->v_uflag & VU_DIROP) { + panic("reclaimed vnode is VU_DIROP"); + vp->v_uflag &= ~VU_DIROP; + TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain); + } + mutex_exit(&lfs_lock); + + pool_put(&lfs_dinode_pool, ip->i_din.ffs1_din); + lfs_deregister_all(vp); + pool_put(&lfs_inoext_pool, ip->inode_ext.lfs); + ip->inode_ext.lfs = NULL; + genfs_node_destroy(vp); + pool_put(&lfs_inode_pool, vp->v_data); + vp->v_data = NULL; + return (0); +} + +/* + * Read a block from a storage device. + * In order to avoid reading blocks that are in the process of being + * written by the cleaner---and hence are not mutexed by the normal + * buffer cache / page cache mechanisms---check for collisions before + * reading. + * + * We inline ufs_strategy to make sure that the VOP_BMAP occurs *before* + * the active cleaner test. + * + * XXX This code assumes that lfs_markv makes synchronous checkpoints.
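+ * + * If the requested block lies in a segment interval the cleaner has + * claimed (fs->lfs_cleanint[]), the read sleeps until the cleaner + * releases the seglock or its outstanding I/O completes.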
+ */ +int +lfs_strategy(void *v) +{ + struct vop_strategy_args /* { + struct vnode *a_vp; + struct buf *a_bp; + } */ *ap = v; + struct buf *bp; + struct lfs *fs; + struct vnode *vp; + struct inode *ip; + daddr_t tbn; + int i, sn, error, slept; + + bp = ap->a_bp; + vp = ap->a_vp; + ip = VTOI(vp); + fs = ip->i_lfs; + + /* lfs uses its strategy routine only for read */ + KASSERT(bp->b_flags & B_READ); + + if (vp->v_type == VBLK || vp->v_type == VCHR) + panic("lfs_strategy: spec"); + KASSERT(bp->b_bcount != 0); + if (bp->b_blkno == bp->b_lblkno) { + error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, + NULL); + if (error) { + bp->b_error = error; + bp->b_resid = bp->b_bcount; + biodone(bp); + return (error); + } + if ((long)bp->b_blkno == -1) /* no valid data */ + clrbuf(bp); + } + if ((long)bp->b_blkno < 0) { /* block is not on disk */ + bp->b_resid = bp->b_bcount; + biodone(bp); + return (0); + } + + slept = 1; + mutex_enter(&lfs_lock); + while (slept && fs->lfs_seglock) { + mutex_exit(&lfs_lock); + /* + * Look through list of intervals. + * There will only be intervals to look through + * if the cleaner holds the seglock. + * Since the cleaner is synchronous, we can trust + * the list of intervals to be current. + */ + tbn = dbtofsb(fs, bp->b_blkno); + sn = dtosn(fs, tbn); + slept = 0; + for (i = 0; i < fs->lfs_cleanind; i++) { + if (sn == dtosn(fs, fs->lfs_cleanint[i]) && + tbn >= fs->lfs_cleanint[i]) { + DLOG((DLOG_CLEAN, + "lfs_strategy: ino %d lbn %" PRId64 + " ind %d sn %d fsb %" PRIx32 + " given sn %d fsb %" PRIx64 "\n", + ip->i_number, bp->b_lblkno, i, + dtosn(fs, fs->lfs_cleanint[i]), + fs->lfs_cleanint[i], sn, tbn)); + DLOG((DLOG_CLEAN, + "lfs_strategy: sleeping on ino %d lbn %" + PRId64 "\n", ip->i_number, bp->b_lblkno)); + mutex_enter(&lfs_lock); + if (LFS_SEGLOCK_HELD(fs) && fs->lfs_iocount) { + /* Cleaner can't wait for itself */ + mtsleep(&fs->lfs_iocount, + (PRIBIO + 1) | PNORELOCK, + "clean2", 0, + &lfs_lock); + slept = 1; + break; + } else if (fs->lfs_seglock) { + mtsleep(&fs->lfs_seglock, + (PRIBIO + 1) | PNORELOCK, + "clean1", 0, + &lfs_lock); + slept = 1; + break; + } + mutex_exit(&lfs_lock); + } + } + mutex_enter(&lfs_lock); + } + mutex_exit(&lfs_lock); + + vp = ip->i_devvp; + VOP_STRATEGY(vp, bp); + return (0); +} + +void +lfs_flush_dirops(struct lfs *fs) +{ + struct inode *ip, *nip; + struct vnode *vp; + extern int lfs_dostats; + struct segment *sp; + + ASSERT_MAYBE_SEGLOCK(fs); + KASSERT(fs->lfs_nadirop == 0); + + if (fs->lfs_ronly) + return; + + mutex_enter(&lfs_lock); + if (TAILQ_FIRST(&fs->lfs_dchainhd) == NULL) { + mutex_exit(&lfs_lock); + return; + } else + mutex_exit(&lfs_lock); + + if (lfs_dostats) + ++lfs_stats.flush_invoked; + + /* + * Inline lfs_segwrite/lfs_writevnodes, but just for dirops. + * Technically this is a checkpoint (the on-disk state is valid) + * even though we are leaving out all the file data. + */ + lfs_imtime(fs); + lfs_seglock(fs, SEGM_CKP); + sp = fs->lfs_sp; + + /* + * lfs_writevnodes, optimized to get dirops out of the way. + * Only write dirops, and don't flush files' pages, only + * blocks from the directories. + * + * We don't need to vref these files because they are + * dirops and so hold an extra reference until the + * segunlock clears them of that status. + * + * We don't need to check for IN_ADIROP because we know that + * no dirops are active. 
+ * + */ + mutex_enter(&lfs_lock); + for (ip = TAILQ_FIRST(&fs->lfs_dchainhd); ip != NULL; ip = nip) { + nip = TAILQ_NEXT(ip, i_lfs_dchain); + mutex_exit(&lfs_lock); + vp = ITOV(ip); + + KASSERT((ip->i_flag & IN_ADIROP) == 0); + + /* + * All writes to directories come from dirops; all + * writes to files' direct blocks go through the page + * cache, which we're not touching. Reads to files + * and/or directories will not be affected by writing + * directory blocks inodes and file inodes. So we don't + * really need to lock. If we don't lock, though, + * make sure that we don't clear IN_MODIFIED + * unnecessarily. + */ + if (vp->v_iflag & VI_XLOCK) { + mutex_enter(&lfs_lock); + continue; + } + /* XXX see below + * waslocked = VOP_ISLOCKED(vp); + */ + if (vp->v_type != VREG && + ((ip->i_flag & IN_ALLMOD) || !VPISEMPTY(vp))) { + lfs_writefile(fs, sp, vp); + if (!VPISEMPTY(vp) && !WRITEINPROG(vp) && + !(ip->i_flag & IN_ALLMOD)) { + mutex_enter(&lfs_lock); + LFS_SET_UINO(ip, IN_MODIFIED); + mutex_exit(&lfs_lock); + } + } + KDASSERT(ip->i_number != LFS_IFILE_INUM); + (void) lfs_writeinode(fs, sp, ip); + mutex_enter(&lfs_lock); + /* + * XXX + * LK_EXCLOTHER is dead -- what is intended here? + * if (waslocked == LK_EXCLOTHER) + * LFS_SET_UINO(ip, IN_MODIFIED); + */ + } + mutex_exit(&lfs_lock); + /* We've written all the dirops there are */ + ((SEGSUM *)(sp->segsum))->ss_flags &= ~(SS_CONT); + lfs_finalize_fs_seguse(fs); + (void) lfs_writeseg(fs, sp); + lfs_segunlock(fs); +} + +/* + * Flush all vnodes for which the pagedaemon has requested pageouts. + * Skip over any files that are marked VU_DIROP (since lfs_flush_dirop() + * has just run, this would be an error). If we have to skip a vnode + * for any reason, just skip it; if we have to wait for the cleaner, + * abort. The writer daemon will call us again later. + */ +void +lfs_flush_pchain(struct lfs *fs) +{ + struct inode *ip, *nip; + struct vnode *vp; + extern int lfs_dostats; + struct segment *sp; + int error; + + ASSERT_NO_SEGLOCK(fs); + + if (fs->lfs_ronly) + return; + + mutex_enter(&lfs_lock); + if (TAILQ_FIRST(&fs->lfs_pchainhd) == NULL) { + mutex_exit(&lfs_lock); + return; + } else + mutex_exit(&lfs_lock); + + /* Get dirops out of the way */ + lfs_flush_dirops(fs); + + if (lfs_dostats) + ++lfs_stats.flush_invoked; + + /* + * Inline lfs_segwrite/lfs_writevnodes, but just for pageouts. + */ + lfs_imtime(fs); + lfs_seglock(fs, 0); + sp = fs->lfs_sp; + + /* + * lfs_writevnodes, optimized to clear pageout requests. + * Only write non-dirop files that are in the pageout queue. + * We're very conservative about what we write; we want to be + * fast and async. 
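+ * + * Vnodes that are locked, marked VU_DIROP, or not regular files are + * simply skipped; if lfs_writefile() returns EAGAIN, the partial segment + * is written out and the loop is abandoned so the writer daemon can try + * again later.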
+ */ + mutex_enter(&lfs_lock); + top: + for (ip = TAILQ_FIRST(&fs->lfs_pchainhd); ip != NULL; ip = nip) { + nip = TAILQ_NEXT(ip, i_lfs_pchain); + vp = ITOV(ip); + + if (!(ip->i_flags & IN_PAGING)) + goto top; + + mutex_enter(vp->v_interlock); + if ((vp->v_iflag & VI_XLOCK) || (vp->v_uflag & VU_DIROP) != 0) { + mutex_exit(vp->v_interlock); + continue; + } + if (vp->v_type != VREG) { + mutex_exit(vp->v_interlock); + continue; + } + if (lfs_vref(vp)) + continue; + mutex_exit(&lfs_lock); + + if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_RETRY) != 0) { + lfs_vunref(vp); + mutex_enter(&lfs_lock); + continue; + } + + error = lfs_writefile(fs, sp, vp); + if (!VPISEMPTY(vp) && !WRITEINPROG(vp) && + !(ip->i_flag & IN_ALLMOD)) { + mutex_enter(&lfs_lock); + LFS_SET_UINO(ip, IN_MODIFIED); + mutex_exit(&lfs_lock); + } + KDASSERT(ip->i_number != LFS_IFILE_INUM); + (void) lfs_writeinode(fs, sp, ip); + + VOP_UNLOCK(vp); + lfs_vunref(vp); + + if (error == EAGAIN) { + lfs_writeseg(fs, sp); + mutex_enter(&lfs_lock); + break; + } + mutex_enter(&lfs_lock); + } + mutex_exit(&lfs_lock); + (void) lfs_writeseg(fs, sp); + lfs_segunlock(fs); +} + +/* + * Provide a fcntl interface to sys_lfs_{segwait,bmapv,markv}. + */ +int +lfs_fcntl(void *v) +{ + struct vop_fcntl_args /* { + struct vnode *a_vp; + u_int a_command; + void * a_data; + int a_fflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct timeval tv; + struct timeval *tvp; + BLOCK_INFO *blkiov; + CLEANERINFO *cip; + SEGUSE *sup; + int blkcnt, error, oclean; + size_t fh_size; + struct lfs_fcntl_markv blkvp; + struct lwp *l; + fsid_t *fsidp; + struct lfs *fs; + struct buf *bp; + fhandle_t *fhp; + daddr_t off; + + /* Only respect LFS fcntls on fs root or Ifile */ + if (VTOI(ap->a_vp)->i_number != ROOTINO && + VTOI(ap->a_vp)->i_number != LFS_IFILE_INUM) { + return ufs_fcntl(v); + } + + /* Avoid locking a draining lock */ + if (ap->a_vp->v_mount->mnt_iflag & IMNT_UNMOUNT) { + return ESHUTDOWN; + } + + /* LFS control and monitoring fcntls are available only to root */ + l = curlwp; + if (((ap->a_command & 0xff00) >> 8) == 'L' && + (error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, + NULL)) != 0) + return (error); + + fs = VTOI(ap->a_vp)->i_lfs; + fsidp = &ap->a_vp->v_mount->mnt_stat.f_fsidx; + + error = 0; + switch ((int)ap->a_command) { + case LFCNSEGWAITALL_COMPAT_50: + case LFCNSEGWAITALL_COMPAT: + fsidp = NULL; + /* FALLSTHROUGH */ + case LFCNSEGWAIT_COMPAT_50: + case LFCNSEGWAIT_COMPAT: + { + struct timeval50 *tvp50 + = (struct timeval50 *)ap->a_data; + timeval50_to_timeval(tvp50, &tv); + tvp = &tv; + } + goto segwait_common; + case LFCNSEGWAITALL: + fsidp = NULL; + /* FALLSTHROUGH */ + case LFCNSEGWAIT: + tvp = (struct timeval *)ap->a_data; +segwait_common: + mutex_enter(&lfs_lock); + ++fs->lfs_sleepers; + mutex_exit(&lfs_lock); + + error = lfs_segwait(fsidp, tvp); + + mutex_enter(&lfs_lock); + if (--fs->lfs_sleepers == 0) + wakeup(&fs->lfs_sleepers); + mutex_exit(&lfs_lock); + return error; + + case LFCNBMAPV: + case LFCNMARKV: + blkvp = *(struct lfs_fcntl_markv *)ap->a_data; + + blkcnt = blkvp.blkcnt; + if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT) + return (EINVAL); + blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV); + if ((error = copyin(blkvp.blkiov, blkiov, + blkcnt * sizeof(BLOCK_INFO))) != 0) { + lfs_free(fs, blkiov, LFS_NB_BLKIOV); + return error; + } + + mutex_enter(&lfs_lock); + ++fs->lfs_sleepers; + mutex_exit(&lfs_lock); + if (ap->a_command == LFCNBMAPV) + error = lfs_bmapv(l->l_proc, fsidp, blkiov, blkcnt); + else /* 
LFCNMARKV */ + error = lfs_markv(l->l_proc, fsidp, blkiov, blkcnt); + if (error == 0) + error = copyout(blkiov, blkvp.blkiov, + blkcnt * sizeof(BLOCK_INFO)); + mutex_enter(&lfs_lock); + if (--fs->lfs_sleepers == 0) + wakeup(&fs->lfs_sleepers); + mutex_exit(&lfs_lock); + lfs_free(fs, blkiov, LFS_NB_BLKIOV); + return error; + + case LFCNRECLAIM: + /* + * Flush dirops and write Ifile, allowing empty segments + * to be immediately reclaimed. + */ + lfs_writer_enter(fs, "pndirop"); + off = fs->lfs_offset; + lfs_seglock(fs, SEGM_FORCE_CKP | SEGM_CKP); + lfs_flush_dirops(fs); + LFS_CLEANERINFO(cip, fs, bp); + oclean = cip->clean; + LFS_SYNC_CLEANERINFO(cip, fs, bp, 1); + lfs_segwrite(ap->a_vp->v_mount, SEGM_FORCE_CKP); + fs->lfs_sp->seg_flags |= SEGM_PROT; + lfs_segunlock(fs); + lfs_writer_leave(fs); + +#ifdef DEBUG + LFS_CLEANERINFO(cip, fs, bp); + DLOG((DLOG_CLEAN, "lfs_fcntl: reclaim wrote %" PRId64 + " blocks, cleaned %" PRId32 " segments (activesb %d)\n", + fs->lfs_offset - off, cip->clean - oclean, + fs->lfs_activesb)); + LFS_SYNC_CLEANERINFO(cip, fs, bp, 0); +#endif + + return 0; + + case LFCNIFILEFH_COMPAT: + /* Return the filehandle of the Ifile */ + if ((error = kauth_authorize_system(l->l_cred, + KAUTH_SYSTEM_FILEHANDLE, 0, NULL, NULL, NULL)) != 0) + return (error); + fhp = (struct fhandle *)ap->a_data; + fhp->fh_fsid = *fsidp; + fh_size = 16; /* former VFS_MAXFIDSIZ */ + return lfs_vptofh(fs->lfs_ivnode, &(fhp->fh_fid), &fh_size); + + case LFCNIFILEFH_COMPAT2: + case LFCNIFILEFH: + /* Return the filehandle of the Ifile */ + fhp = (struct fhandle *)ap->a_data; + fhp->fh_fsid = *fsidp; + fh_size = sizeof(struct lfs_fhandle) - + offsetof(fhandle_t, fh_fid); + return lfs_vptofh(fs->lfs_ivnode, &(fhp->fh_fid), &fh_size); + + case LFCNREWIND: + /* Move lfs_offset to the lowest-numbered segment */ + return lfs_rewind(fs, *(int *)ap->a_data); + + case LFCNINVAL: + /* Mark a segment SEGUSE_INVAL */ + LFS_SEGENTRY(sup, fs, *(int *)ap->a_data, bp); + if (sup->su_nbytes > 0) { + brelse(bp, 0); + lfs_unset_inval_all(fs); + return EBUSY; + } + sup->su_flags |= SEGUSE_INVAL; + VOP_BWRITE(bp->b_vp, bp); + return 0; + + case LFCNRESIZE: + /* Resize the filesystem */ + return lfs_resize_fs(fs, *(int *)ap->a_data); + + case LFCNWRAPSTOP: + case LFCNWRAPSTOP_COMPAT: + /* + * Hold lfs_newseg at segment 0; if requested, sleep until + * the filesystem wraps around. To support external agents + * (dump, fsck-based regression test) that need to look at + * a snapshot of the filesystem, without necessarily + * requiring that all fs activity stops. + */ + if (fs->lfs_stoplwp == curlwp) + return EALREADY; + + mutex_enter(&lfs_lock); + while (fs->lfs_stoplwp != NULL) + cv_wait(&fs->lfs_stopcv, &lfs_lock); + fs->lfs_stoplwp = curlwp; + if (fs->lfs_nowrap == 0) + log(LOG_NOTICE, "%s: disabled log wrap\n", fs->lfs_fsmnt); + ++fs->lfs_nowrap; + if (*(int *)ap->a_data == 1 + || ap->a_command == LFCNWRAPSTOP_COMPAT) { + log(LOG_NOTICE, "LFCNSTOPWRAP waiting for log wrap\n"); + error = mtsleep(&fs->lfs_nowrap, PCATCH | PUSER, + "segwrap", 0, &lfs_lock); + log(LOG_NOTICE, "LFCNSTOPWRAP done waiting\n"); + if (error) { + lfs_wrapgo(fs, VTOI(ap->a_vp), 0); + } + } + mutex_exit(&lfs_lock); + return 0; + + case LFCNWRAPGO: + case LFCNWRAPGO_COMPAT: + /* + * Having done its work, the agent wakes up the writer. + * If the argument is 1, it sleeps until a new segment + * is selected. + */ + mutex_enter(&lfs_lock); + error = lfs_wrapgo(fs, VTOI(ap->a_vp), + ap->a_command == LFCNWRAPGO_COMPAT ? 
1 : + *((int *)ap->a_data)); + mutex_exit(&lfs_lock); + return error; + + case LFCNWRAPPASS: + if ((VTOI(ap->a_vp)->i_lfs_iflags & LFSI_WRAPWAIT)) + return EALREADY; + mutex_enter(&lfs_lock); + if (fs->lfs_stoplwp != curlwp) { + mutex_exit(&lfs_lock); + return EALREADY; + } + if (fs->lfs_nowrap == 0) { + mutex_exit(&lfs_lock); + return EBUSY; + } + fs->lfs_wrappass = 1; + wakeup(&fs->lfs_wrappass); + /* Wait for the log to wrap, if asked */ + if (*(int *)ap->a_data) { + mutex_enter(ap->a_vp->v_interlock); + lfs_vref(ap->a_vp); + VTOI(ap->a_vp)->i_lfs_iflags |= LFSI_WRAPWAIT; + log(LOG_NOTICE, "LFCNPASS waiting for log wrap\n"); + error = mtsleep(&fs->lfs_nowrap, PCATCH | PUSER, + "segwrap", 0, &lfs_lock); + log(LOG_NOTICE, "LFCNPASS done waiting\n"); + VTOI(ap->a_vp)->i_lfs_iflags &= ~LFSI_WRAPWAIT; + lfs_vunref(ap->a_vp); + } + mutex_exit(&lfs_lock); + return error; + + case LFCNWRAPSTATUS: + mutex_enter(&lfs_lock); + *(int *)ap->a_data = fs->lfs_wrapstatus; + mutex_exit(&lfs_lock); + return 0; + + default: + return ufs_fcntl(v); + } + return 0; +} + +int +lfs_getpages(void *v) +{ + struct vop_getpages_args /* { + struct vnode *a_vp; + voff_t a_offset; + struct vm_page **a_m; + int *a_count; + int a_centeridx; + vm_prot_t a_access_type; + int a_advice; + int a_flags; + } */ *ap = v; + + if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM && + (ap->a_access_type & VM_PROT_WRITE) != 0) { + return EPERM; + } + if ((ap->a_access_type & VM_PROT_WRITE) != 0) { + mutex_enter(&lfs_lock); + LFS_SET_UINO(VTOI(ap->a_vp), IN_MODIFIED); + mutex_exit(&lfs_lock); + } + + /* + * we're relying on the fact that genfs_getpages() always read in + * entire filesystem blocks. + */ + return genfs_getpages(v); +} + +/* + * Wait for a page to become unbusy, possibly printing diagnostic messages + * as well. + * + * Called with vp->v_interlock held; return with it held. + */ +static void +wait_for_page(struct vnode *vp, struct vm_page *pg, const char *label) +{ + if ((pg->flags & PG_BUSY) == 0) + return; /* Nothing to wait for! */ + +#if defined(DEBUG) && defined(UVM_PAGE_TRKOWN) + static struct vm_page *lastpg; + + if (label != NULL && pg != lastpg) { + if (pg->owner_tag) { + printf("lfs_putpages[%d.%d]: %s: page %p owner %d.%d [%s]\n", + curproc->p_pid, curlwp->l_lid, label, + pg, pg->owner, pg->lowner, pg->owner_tag); + } else { + printf("lfs_putpages[%d.%d]: %s: page %p unowned?!\n", + curproc->p_pid, curlwp->l_lid, label, pg); + } + } + lastpg = pg; +#endif + + pg->flags |= PG_WANTED; + UVM_UNLOCK_AND_WAIT(pg, vp->v_interlock, 0, "lfsput", 0); + mutex_enter(vp->v_interlock); +} + +/* + * This routine is called by lfs_putpages() when it can't complete the + * write because a page is busy. This means that either (1) someone, + * possibly the pagedaemon, is looking at this page, and will give it up + * presently; or (2) we ourselves are holding the page busy in the + * process of being written (either gathered or actually on its way to + * disk). We don't need to give up the segment lock, but we might need + * to call lfs_writeseg() to expedite the page's journey to disk. + * + * Called with vp->v_interlock held; return with it held. 
+ */ +/* #define BUSYWAIT */ +static void +write_and_wait(struct lfs *fs, struct vnode *vp, struct vm_page *pg, + int seglocked, const char *label) +{ +#ifndef BUSYWAIT + struct inode *ip = VTOI(vp); + struct segment *sp = fs->lfs_sp; + int count = 0; + + if (pg == NULL) + return; + + while (pg->flags & PG_BUSY && + pg->uobject == &vp->v_uobj) { + mutex_exit(vp->v_interlock); + if (sp->cbpp - sp->bpp > 1) { + /* Write gathered pages */ + lfs_updatemeta(sp); + lfs_release_finfo(fs); + (void) lfs_writeseg(fs, sp); + + /* + * Reinitialize FIP + */ + KASSERT(sp->vp == vp); + lfs_acquire_finfo(fs, ip->i_number, + ip->i_gen); + } + ++count; + mutex_enter(vp->v_interlock); + wait_for_page(vp, pg, label); + } + if (label != NULL && count > 1) + printf("lfs_putpages[%d]: %s: %sn = %d\n", curproc->p_pid, + label, (count > 0 ? "looping, " : ""), count); +#else + preempt(1); +#endif +} + +/* + * Make sure that for all pages in every block in the given range, + * either all are dirty or all are clean. If any of the pages + * we've seen so far are dirty, put the vnode on the paging chain, + * and mark it IN_PAGING. + * + * If checkfirst != 0, don't check all the pages but return at the + * first dirty page. + */ +static int +check_dirty(struct lfs *fs, struct vnode *vp, + off_t startoffset, off_t endoffset, off_t blkeof, + int flags, int checkfirst, struct vm_page **pgp) +{ + int by_list; + struct vm_page *curpg = NULL; /* XXX: gcc */ + struct vm_page *pgs[MAXBSIZE / PAGE_SIZE], *pg; + off_t soff = 0; /* XXX: gcc */ + voff_t off; + int i; + int nonexistent; + int any_dirty; /* number of dirty pages */ + int dirty; /* number of dirty pages in a block */ + int tdirty; + int pages_per_block = fs->lfs_bsize >> PAGE_SHIFT; + int pagedaemon = (curlwp == uvm.pagedaemon_lwp); + + ASSERT_MAYBE_SEGLOCK(fs); + top: + by_list = (vp->v_uobj.uo_npages <= + ((endoffset - startoffset) >> PAGE_SHIFT) * + UVM_PAGE_TREE_PENALTY); + any_dirty = 0; + + if (by_list) { + curpg = TAILQ_FIRST(&vp->v_uobj.memq); + } else { + soff = startoffset; + } + while (by_list || soff < MIN(blkeof, endoffset)) { + if (by_list) { + /* + * Find the first page in a block. Skip + * blocks outside our area of interest or beyond + * the end of file. + */ + KASSERT(curpg == NULL + || (curpg->flags & PG_MARKER) == 0); + if (pages_per_block > 1) { + while (curpg && + ((curpg->offset & fs->lfs_bmask) || + curpg->offset >= vp->v_size || + curpg->offset >= endoffset)) { + curpg = TAILQ_NEXT(curpg, listq.queue); + KASSERT(curpg == NULL || + (curpg->flags & PG_MARKER) == 0); + } + } + if (curpg == NULL) + break; + soff = curpg->offset; + } + + /* + * Mark all pages in extended range busy; find out if any + * of them are dirty. + */ + nonexistent = dirty = 0; + for (i = 0; i == 0 || i < pages_per_block; i++) { + if (by_list && pages_per_block <= 1) { + pgs[i] = pg = curpg; + } else { + off = soff + (i << PAGE_SHIFT); + pgs[i] = pg = uvm_pagelookup(&vp->v_uobj, off); + if (pg == NULL) { + ++nonexistent; + continue; + } + } + KASSERT(pg != NULL); + + /* + * If we're holding the segment lock, we can deadlock + * against a process that has our page and is waiting + * for the cleaner, while the cleaner waits for the + * segment lock. Just bail in that case. 
+ */ + if ((pg->flags & PG_BUSY) && + (pagedaemon || LFS_SEGLOCK_HELD(fs))) { + if (i > 0) + uvm_page_unbusy(pgs, i); + DLOG((DLOG_PAGE, "lfs_putpages: avoiding 3-way or pagedaemon deadlock\n")); + if (pgp) + *pgp = pg; + return -1; + } + + while (pg->flags & PG_BUSY) { + wait_for_page(vp, pg, NULL); + if (i > 0) + uvm_page_unbusy(pgs, i); + goto top; + } + pg->flags |= PG_BUSY; + UVM_PAGE_OWN(pg, "lfs_putpages"); + + pmap_page_protect(pg, VM_PROT_NONE); + tdirty = (pmap_clear_modify(pg) || + (pg->flags & PG_CLEAN) == 0); + dirty += tdirty; + } + if (pages_per_block > 0 && nonexistent >= pages_per_block) { + if (by_list) { + curpg = TAILQ_NEXT(curpg, listq.queue); + } else { + soff += fs->lfs_bsize; + } + continue; + } + + any_dirty += dirty; + KASSERT(nonexistent == 0); + + /* + * If any are dirty make all dirty; unbusy them, + * but if we were asked to clean, wire them so that + * the pagedaemon doesn't bother us about them while + * they're on their way to disk. + */ + for (i = 0; i == 0 || i < pages_per_block; i++) { + pg = pgs[i]; + KASSERT(!((pg->flags & PG_CLEAN) && (pg->flags & PG_DELWRI))); + if (dirty) { + pg->flags &= ~PG_CLEAN; + if (flags & PGO_FREE) { + /* + * Wire the page so that + * pdaemon doesn't see it again. + */ + mutex_enter(&uvm_pageqlock); + uvm_pagewire(pg); + mutex_exit(&uvm_pageqlock); + + /* Suspended write flag */ + pg->flags |= PG_DELWRI; + } + } + if (pg->flags & PG_WANTED) + wakeup(pg); + pg->flags &= ~(PG_WANTED|PG_BUSY); + UVM_PAGE_OWN(pg, NULL); + } + + if (checkfirst && any_dirty) + break; + + if (by_list) { + curpg = TAILQ_NEXT(curpg, listq.queue); + } else { + soff += MAX(PAGE_SIZE, fs->lfs_bsize); + } + } + + return any_dirty; +} + +/* + * lfs_putpages functions like genfs_putpages except that + * + * (1) It needs to bounds-check the incoming requests to ensure that + * they are block-aligned; if they are not, expand the range and + * do the right thing in case, e.g., the requested range is clean + * but the expanded range is dirty. + * + * (2) It needs to explicitly send blocks to be written when it is done. + * If VOP_PUTPAGES is called without the seglock held, we simply take + * the seglock and let lfs_segunlock wait for us. + * XXX There might be a bad situation if we have to flush a vnode while + * XXX lfs_markv is in operation. As of this writing we panic in this + * XXX case. + * + * Assumptions: + * + * (1) The caller does not hold any pages in this vnode busy. If it does, + * there is a danger that when we expand the page range and busy the + * pages we will deadlock. + * + * (2) We are called with vp->v_interlock held; we must return with it + * released. + * + * (3) We don't absolutely have to free pages right away, provided that + * the request does not have PGO_SYNCIO. When the pagedaemon gives + * us a request with PGO_FREE, we take the pages out of the paging + * queue and wake up the writer, which will handle freeing them for us. + * + * We ensure that for any filesystem block, all pages for that + * block are either resident or not, even if those pages are higher + * than EOF; that means that we will be getting requests to free + * "unused" pages above EOF all the time, and should ignore them. + * + * (4) If we are called with PGO_LOCKED, the finfo array we are to write + * into has been set up for us by lfs_writefile. If not, we will + * have to handle allocating and/or freeing an finfo entry. + * + * XXX note that we're (ab)using PGO_LOCKED as "seglock held". 
+ */ + +/* How many times to loop before we should start to worry */ +#define TOOMANY 4 + +int +lfs_putpages(void *v) +{ + int error; + struct vop_putpages_args /* { + struct vnode *a_vp; + voff_t a_offlo; + voff_t a_offhi; + int a_flags; + } */ *ap = v; + struct vnode *vp; + struct inode *ip; + struct lfs *fs; + struct segment *sp; + off_t origoffset, startoffset, endoffset, origendoffset, blkeof; + off_t off, max_endoffset; + bool seglocked, sync, pagedaemon; + struct vm_page *pg, *busypg; + UVMHIST_FUNC("lfs_putpages"); UVMHIST_CALLED(ubchist); +#ifdef DEBUG + int debug_n_again, debug_n_dirtyclean; +#endif + + vp = ap->a_vp; + ip = VTOI(vp); + fs = ip->i_lfs; + sync = (ap->a_flags & PGO_SYNCIO) != 0; + pagedaemon = (curlwp == uvm.pagedaemon_lwp); + + /* Putpages does nothing for metadata. */ + if (vp == fs->lfs_ivnode || vp->v_type != VREG) { + mutex_exit(vp->v_interlock); + return 0; + } + + /* + * If there are no pages, don't do anything. + */ + if (vp->v_uobj.uo_npages == 0) { + if (TAILQ_EMPTY(&vp->v_uobj.memq) && + (vp->v_iflag & VI_ONWORKLST) && + LIST_FIRST(&vp->v_dirtyblkhd) == NULL) { + vp->v_iflag &= ~VI_WRMAPDIRTY; + vn_syncer_remove_from_worklist(vp); + } + mutex_exit(vp->v_interlock); + + /* Remove us from paging queue, if we were on it */ + mutex_enter(&lfs_lock); + if (ip->i_flags & IN_PAGING) { + ip->i_flags &= ~IN_PAGING; + TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain); + } + mutex_exit(&lfs_lock); + return 0; + } + + blkeof = blkroundup(fs, ip->i_size); + + /* + * Ignore requests to free pages past EOF but in the same block + * as EOF, unless the request is synchronous. (If the request is + * sync, it comes from lfs_truncate.) + * XXXUBC Make these pages look "active" so the pagedaemon won't + * XXXUBC bother us with them again. + */ + if (!sync && ap->a_offlo >= ip->i_size && ap->a_offlo < blkeof) { + origoffset = ap->a_offlo; + for (off = origoffset; off < blkeof; off += fs->lfs_bsize) { + pg = uvm_pagelookup(&vp->v_uobj, off); + KASSERT(pg != NULL); + while (pg->flags & PG_BUSY) { + pg->flags |= PG_WANTED; + UVM_UNLOCK_AND_WAIT(pg, vp->v_interlock, 0, + "lfsput2", 0); + mutex_enter(vp->v_interlock); + } + mutex_enter(&uvm_pageqlock); + uvm_pageactivate(pg); + mutex_exit(&uvm_pageqlock); + } + ap->a_offlo = blkeof; + if (ap->a_offhi > 0 && ap->a_offhi <= ap->a_offlo) { + mutex_exit(vp->v_interlock); + return 0; + } + } + + /* + * Extend page range to start and end at block boundaries. + * (For the purposes of VOP_PUTPAGES, fragments don't exist.) + */ + origoffset = ap->a_offlo; + origendoffset = ap->a_offhi; + startoffset = origoffset & ~(fs->lfs_bmask); + max_endoffset = (trunc_page(LLONG_MAX) >> fs->lfs_bshift) + << fs->lfs_bshift; + + if (origendoffset == 0 || ap->a_flags & PGO_ALLPAGES) { + endoffset = max_endoffset; + origendoffset = endoffset; + } else { + origendoffset = round_page(ap->a_offhi); + endoffset = round_page(blkroundup(fs, origendoffset)); + } + + KASSERT(startoffset > 0 || endoffset >= startoffset); + if (startoffset == endoffset) { + /* Nothing to do, why were we called? */ + mutex_exit(vp->v_interlock); + DLOG((DLOG_PAGE, "lfs_putpages: startoffset = endoffset = %" + PRId64 "\n", startoffset)); + return 0; + } + + ap->a_offlo = startoffset; + ap->a_offhi = endoffset; + + /* + * If not cleaning, just send the pages through genfs_putpages + * to be returned to the pool. 
+ */ + if (!(ap->a_flags & PGO_CLEANIT)) + return genfs_putpages(v); + + /* Set PGO_BUSYFAIL to avoid deadlocks */ + ap->a_flags |= PGO_BUSYFAIL; + + /* + * Likewise, if we are asked to clean but the pages are not + * dirty, we can just free them using genfs_putpages. + */ +#ifdef DEBUG + debug_n_dirtyclean = 0; +#endif + do { + int r; + + /* Count the number of dirty pages */ + r = check_dirty(fs, vp, startoffset, endoffset, blkeof, + ap->a_flags, 1, NULL); + if (r < 0) { + /* Pages are busy with another process */ + mutex_exit(vp->v_interlock); + return EDEADLK; + } + if (r > 0) /* Some pages are dirty */ + break; + + /* + * Sometimes pages are dirtied between the time that + * we check and the time we try to clean them. + * Instruct lfs_gop_write to return EDEADLK in this case + * so we can write them properly. + */ + ip->i_lfs_iflags |= LFSI_NO_GOP_WRITE; + r = genfs_do_putpages(vp, startoffset, endoffset, + ap->a_flags & ~PGO_SYNCIO, &busypg); + ip->i_lfs_iflags &= ~LFSI_NO_GOP_WRITE; + if (r != EDEADLK) + return r; + + /* One of the pages was busy. Start over. */ + mutex_enter(vp->v_interlock); + wait_for_page(vp, busypg, "dirtyclean"); +#ifdef DEBUG + ++debug_n_dirtyclean; +#endif + } while(1); + +#ifdef DEBUG + if (debug_n_dirtyclean > TOOMANY) + printf("lfs_putpages: dirtyclean: looping, n = %d\n", + debug_n_dirtyclean); +#endif + + /* + * Dirty and asked to clean. + * + * Pagedaemon can't actually write LFS pages; wake up + * the writer to take care of that. The writer will + * notice the pager inode queue and act on that. + * + * XXX We must drop the vp->interlock before taking the lfs_lock or we + * get a nasty deadlock with lfs_flush_pchain(). + */ + if (pagedaemon) { + mutex_exit(vp->v_interlock); + mutex_enter(&lfs_lock); + if (!(ip->i_flags & IN_PAGING)) { + ip->i_flags |= IN_PAGING; + TAILQ_INSERT_TAIL(&fs->lfs_pchainhd, ip, i_lfs_pchain); + } + wakeup(&lfs_writer_daemon); + mutex_exit(&lfs_lock); + preempt(); + return EWOULDBLOCK; + } + + /* + * If this is a file created in a recent dirop, we can't flush its + * inode until the dirop is complete. Drain dirops, then flush the + * filesystem (taking care of any other pending dirops while we're + * at it). + */ + if ((ap->a_flags & (PGO_CLEANIT|PGO_LOCKED)) == PGO_CLEANIT && + (vp->v_uflag & VU_DIROP)) { + int locked; + + DLOG((DLOG_PAGE, "lfs_putpages: flushing VU_DIROP\n")); + /* XXX VOP_ISLOCKED() may not be used for lock decisions. */ + locked = (VOP_ISLOCKED(vp) == LK_EXCLUSIVE); + mutex_exit(vp->v_interlock); + lfs_writer_enter(fs, "ppdirop"); + if (locked) + VOP_UNLOCK(vp); /* XXX why? */ + + mutex_enter(&lfs_lock); + lfs_flush_fs(fs, sync ? SEGM_SYNC : 0); + mutex_exit(&lfs_lock); + + if (locked) + VOP_LOCK(vp, LK_EXCLUSIVE); + mutex_enter(vp->v_interlock); + lfs_writer_leave(fs); + + /* XXX the flush should have taken care of this one too! */ + } + + /* + * This is it. We are going to write some pages. From here on + * down it's all just mechanics. + * + * Don't let genfs_putpages wait; lfs_segunlock will wait for us. + */ + ap->a_flags &= ~PGO_SYNCIO; + + /* + * If we've already got the seglock, flush the node and return. + * The FIP has already been set up for us by lfs_writefile, + * and FIP cleanup and lfs_updatemeta will also be done there, + * unless genfs_putpages returns EDEADLK; then we must flush + * what we have, and correct FIP and segment header accounting. + */ + get_seglock: + /* + * If we are not called with the segment locked, lock it. + * Account for a new FIP in the segment header, and set sp->vp. 
+ * (This should duplicate the setup at the top of lfs_writefile().) + */ + seglocked = (ap->a_flags & PGO_LOCKED) != 0; + if (!seglocked) { + mutex_exit(vp->v_interlock); + error = lfs_seglock(fs, SEGM_PROT | (sync ? SEGM_SYNC : 0)); + if (error != 0) + return error; + mutex_enter(vp->v_interlock); + lfs_acquire_finfo(fs, ip->i_number, ip->i_gen); + } + sp = fs->lfs_sp; + KASSERT(sp->vp == NULL); + sp->vp = vp; + + /* + * Ensure that the partial segment is marked SS_DIROP if this + * vnode is a DIROP. + */ + if (!seglocked && vp->v_uflag & VU_DIROP) + ((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT); + + /* + * Loop over genfs_putpages until all pages are gathered. + * genfs_putpages() drops the interlock, so reacquire it if necessary. + * Whenever we lose the interlock we have to rerun check_dirty, as + * well, since more pages might have been dirtied in our absence. + */ +#ifdef DEBUG + debug_n_again = 0; +#endif + do { + busypg = NULL; + if (check_dirty(fs, vp, startoffset, endoffset, blkeof, + ap->a_flags, 0, &busypg) < 0) { + mutex_exit(vp->v_interlock); + + mutex_enter(vp->v_interlock); + write_and_wait(fs, vp, busypg, seglocked, NULL); + if (!seglocked) { + mutex_exit(vp->v_interlock); + lfs_release_finfo(fs); + lfs_segunlock(fs); + mutex_enter(vp->v_interlock); + } + sp->vp = NULL; + goto get_seglock; + } + + busypg = NULL; + error = genfs_do_putpages(vp, startoffset, endoffset, + ap->a_flags, &busypg); + + if (error == EDEADLK || error == EAGAIN) { + DLOG((DLOG_PAGE, "lfs_putpages: genfs_putpages returned" + " %d ino %d off %x (seg %d)\n", error, + ip->i_number, fs->lfs_offset, + dtosn(fs, fs->lfs_offset))); + + mutex_enter(vp->v_interlock); + write_and_wait(fs, vp, busypg, seglocked, "again"); + } +#ifdef DEBUG + ++debug_n_again; +#endif + } while (error == EDEADLK); +#ifdef DEBUG + if (debug_n_again > TOOMANY) + printf("lfs_putpages: again: looping, n = %d\n", debug_n_again); +#endif + + KASSERT(sp != NULL && sp->vp == vp); + if (!seglocked) { + sp->vp = NULL; + + /* Write indirect blocks as well */ + lfs_gather(fs, fs->lfs_sp, vp, lfs_match_indir); + lfs_gather(fs, fs->lfs_sp, vp, lfs_match_dindir); + lfs_gather(fs, fs->lfs_sp, vp, lfs_match_tindir); + + KASSERT(sp->vp == NULL); + sp->vp = vp; + } + + /* + * Blocks are now gathered into a segment waiting to be written. + * All that's left to do is update metadata, and write them. + */ + lfs_updatemeta(sp); + KASSERT(sp->vp == vp); + sp->vp = NULL; + + /* + * If we were called from lfs_writefile, we don't need to clean up + * the FIP or unlock the segment lock. We're done. + */ + if (seglocked) + return error; + + /* Clean up FIP and send it to disk. */ + lfs_release_finfo(fs); + lfs_writeseg(fs, fs->lfs_sp); + + /* + * Remove us from paging queue if we wrote all our pages. + */ + if (origendoffset == 0 || ap->a_flags & PGO_ALLPAGES) { + mutex_enter(&lfs_lock); + if (ip->i_flags & IN_PAGING) { + ip->i_flags &= ~IN_PAGING; + TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain); + } + mutex_exit(&lfs_lock); + } + + /* + * XXX - with the malloc/copy writeseg, the pages are freed by now + * even if we don't wait (e.g. if we hold a nested lock). This + * will not be true if we stop using malloc/copy. + */ + KASSERT(fs->lfs_sp->seg_flags & SEGM_PROT); + lfs_segunlock(fs); + + /* + * Wait for v_numoutput to drop to zero. The seglock should + * take care of this, but there is a slight possibility that + * aiodoned might not have got around to our buffers yet. 
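The sync path that follows waits for vp->v_numoutput to drain to zero under the interlock, re-checking the counter after every wakeup. A minimal pthread sketch of that wait-loop idiom, with hypothetical names standing in for v_numoutput and v_cv:

#include <pthread.h>

static pthread_mutex_t io_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t io_cv = PTHREAD_COND_INITIALIZER;
static int num_output;			/* outstanding writes, cf. v_numoutput */

/* Called by the I/O completion path. */
static void
io_done(void)
{
	pthread_mutex_lock(&io_lock);
	if (--num_output == 0)
		pthread_cond_broadcast(&io_cv);
	pthread_mutex_unlock(&io_lock);
}

/* Called by the sync path: block until all writes have completed. */
static void
wait_for_output(void)
{
	pthread_mutex_lock(&io_lock);
	while (num_output > 0)		/* always re-check after a wakeup */
		pthread_cond_wait(&io_cv, &io_lock);
	pthread_mutex_unlock(&io_lock);
}

The while loop (rather than a single wait) matters because a wakeup only says the counter changed, not that it reached zero.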
+ */ + if (sync) { + mutex_enter(vp->v_interlock); + while (vp->v_numoutput > 0) { + DLOG((DLOG_PAGE, "lfs_putpages: ino %d sleeping on" + " num %d\n", ip->i_number, vp->v_numoutput)); + cv_wait(&vp->v_cv, vp->v_interlock); + } + mutex_exit(vp->v_interlock); + } + return error; +} + +/* + * Return the last logical file offset that should be written for this file + * if we're doing a write that ends at "size". If writing, we need to know + * about sizes on disk, i.e. fragments if there are any; if reading, we need + * to know about entire blocks. + */ +void +lfs_gop_size(struct vnode *vp, off_t size, off_t *eobp, int flags) +{ + struct inode *ip = VTOI(vp); + struct lfs *fs = ip->i_lfs; + daddr_t olbn, nlbn; + + olbn = lblkno(fs, ip->i_size); + nlbn = lblkno(fs, size); + if (!(flags & GOP_SIZE_MEM) && nlbn < NDADDR && olbn <= nlbn) { + *eobp = fragroundup(fs, size); + } else { + *eobp = blkroundup(fs, size); + } +} + +#ifdef DEBUG +void lfs_dump_vop(void *); + +void +lfs_dump_vop(void *v) +{ + struct vop_putpages_args /* { + struct vnode *a_vp; + voff_t a_offlo; + voff_t a_offhi; + int a_flags; + } */ *ap = v; + +#ifdef DDB + vfs_vnode_print(ap->a_vp, 0, printf); +#endif + lfs_dump_dinode(VTOI(ap->a_vp)->i_din.ffs1_din); +} +#endif + +int +lfs_mmap(void *v) +{ + struct vop_mmap_args /* { + const struct vnodeop_desc *a_desc; + struct vnode *a_vp; + vm_prot_t a_prot; + kauth_cred_t a_cred; + } */ *ap = v; + + if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM) + return EOPNOTSUPP; + return ufs_mmap(v); +} diff --git a/sys/ufs/mfs/Makefile b/sys/ufs/mfs/Makefile new file mode 100644 index 000000000..c0fdca997 --- /dev/null +++ b/sys/ufs/mfs/Makefile @@ -0,0 +1,7 @@ +# $NetBSD: Makefile,v 1.2 1999/07/03 18:40:32 thorpej Exp $ + +INCSDIR= /usr/include/ufs/mfs + +INCS= mfs_extern.h mfsnode.h + +.include diff --git a/include/ufs/mfs/mfs_extern.h b/sys/ufs/mfs/mfs_extern.h similarity index 100% rename from include/ufs/mfs/mfs_extern.h rename to sys/ufs/mfs/mfs_extern.h diff --git a/sys/ufs/mfs/mfs_miniroot.c b/sys/ufs/mfs/mfs_miniroot.c new file mode 100644 index 000000000..cfd4a03b6 --- /dev/null +++ b/sys/ufs/mfs/mfs_miniroot.c @@ -0,0 +1,68 @@ +/* $NetBSD: mfs_miniroot.c,v 1.1 2010/03/02 17:20:02 pooka Exp $ */ + +/* + * Copyright (c) 1989, 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)mfs_vfsops.c 8.11 (Berkeley) 6/19/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: mfs_miniroot.c,v 1.1 2010/03/02 17:20:02 pooka Exp $"); + +#include + +#include +#include + +void * mfs_rootbase; /* address of mini-root in kernel virtual memory */ +u_long mfs_rootsize; /* size of mini-root in bytes */ + +/* + * This is called early in boot to set the base address and size + * of the mini-root. + */ +int +mfs_initminiroot(void *base) +{ + struct fs *fs = (struct fs *)((char *)base + SBLOCK_UFS1); + static bool inited = false; + + if (inited) + panic("mfs_initminiroot() called more than once"); + inited = true; + + /* check for valid super block */ + if (fs->fs_magic != FS_UFS1_MAGIC || fs->fs_bsize > MAXBSIZE || + fs->fs_bsize < sizeof(struct fs)) + return (0); + rootfstype = MOUNT_MFS; + mfs_rootbase = base; + mfs_rootsize = fs->fs_fsize * fs->fs_size; + rootdev = makedev(255, 0); + return (mfs_rootsize); +} diff --git a/sys/ufs/mfs/mfs_vfsops.c b/sys/ufs/mfs/mfs_vfsops.c new file mode 100644 index 000000000..292998dc0 --- /dev/null +++ b/sys/ufs/mfs/mfs_vfsops.c @@ -0,0 +1,444 @@ +/* $NetBSD: mfs_vfsops.c,v 1.103 2011/06/12 03:36:01 rmind Exp $ */ + +/* + * Copyright (c) 1989, 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)mfs_vfsops.c 8.11 (Berkeley) 6/19/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: mfs_vfsops.c,v 1.103 2011/06/12 03:36:01 rmind Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_compat_netbsd.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +MODULE(MODULE_CLASS_VFS, mfs, "ffs"); + +kmutex_t mfs_lock; /* global lock */ + +/* used for building internal dev_t, minor == 0 reserved for miniroot */ +static int mfs_minor = 1; +static int mfs_initcnt; + +extern int (**mfs_vnodeop_p)(void *); + +static struct sysctllog *mfs_sysctl_log; + +/* + * mfs vfs operations. + */ + +extern const struct vnodeopv_desc mfs_vnodeop_opv_desc; + +const struct vnodeopv_desc * const mfs_vnodeopv_descs[] = { + &mfs_vnodeop_opv_desc, + NULL, +}; + +struct vfsops mfs_vfsops = { + MOUNT_MFS, + sizeof (struct mfs_args), + mfs_mount, + mfs_start, + ffs_unmount, + ufs_root, + ufs_quotactl, + mfs_statvfs, + ffs_sync, + ffs_vget, + ffs_fhtovp, + ffs_vptofh, + mfs_init, + mfs_reinit, + mfs_done, + NULL, + (int (*)(struct mount *, struct vnode *, struct timespec *)) eopnotsupp, + vfs_stdextattrctl, + (void *)eopnotsupp, /* vfs_suspendctl */ + genfs_renamelock_enter, + genfs_renamelock_exit, + (void *)eopnotsupp, + mfs_vnodeopv_descs, + 0, + { NULL, NULL }, +}; + +static int +mfs_modcmd(modcmd_t cmd, void *arg) +{ + int error; + + switch (cmd) { + case MODULE_CMD_INIT: + error = vfs_attach(&mfs_vfsops); + if (error != 0) + break; + sysctl_createv(&mfs_sysctl_log, 0, NULL, NULL, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "vfs", NULL, + NULL, 0, NULL, 0, + CTL_VFS, CTL_EOL); + sysctl_createv(&mfs_sysctl_log, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_ALIAS, + CTLTYPE_NODE, "mfs", + SYSCTL_DESCR("Memory based file system"), + NULL, 1, NULL, 0, + CTL_VFS, 3, CTL_EOL); + /* + * XXX the "1" and the "3" above could be dynamic, thereby + * eliminating one more instance of the "number to vfs" + * mapping problem, but they are in order as taken from + * sys/mount.h + */ + break; + case MODULE_CMD_FINI: + error = vfs_detach(&mfs_vfsops); + if (error != 0) + break; + sysctl_teardown(&mfs_sysctl_log); + break; + default: + error = ENOTTY; + break; + } + + return (error); +} + +/* + * Memory based filesystem initialization. + */ +void +mfs_init(void) +{ + + if (mfs_initcnt++ == 0) { + mutex_init(&mfs_lock, MUTEX_DEFAULT, IPL_NONE); + ffs_init(); + } +} + +void +mfs_reinit(void) +{ + + ffs_reinit(); +} + +void +mfs_done(void) +{ + + if (--mfs_initcnt == 0) { + ffs_done(); + mutex_destroy(&mfs_lock); + } +} + +/* + * Called by main() when mfs is going to be mounted as root. 
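mfs_init()/mfs_done() above initialize the shared ffs state and the global mutex only on the first reference and tear them down on the last, guarded by mfs_initcnt. A small sketch of that first-in/last-out counting pattern, with hypothetical names (and relying, as the kernel code does, on the caller serializing init/done):

#include <stdio.h>

static int initcnt;

static void subsystem_setup(void)    { puts("setup"); }
static void subsystem_teardown(void) { puts("teardown"); }

static void
module_init(void)
{
	if (initcnt++ == 0)		/* only the first user does the real work */
		subsystem_setup();
}

static void
module_done(void)
{
	if (--initcnt == 0)		/* only the last user tears it down */
		subsystem_teardown();
}

int
main(void)
{
	module_init();
	module_init();			/* second user: no extra setup */
	module_done();
	module_done();			/* last user: teardown happens here */
	return 0;
}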
+ */ + +int +mfs_mountroot(void) +{ + struct fs *fs; + struct mount *mp; + struct lwp *l = curlwp; /* XXX */ + struct ufsmount *ump; + struct mfsnode *mfsp; + int error = 0; + + if ((error = vfs_rootmountalloc(MOUNT_MFS, "mfs_root", &mp))) { + vrele(rootvp); + return (error); + } + + mfsp = kmem_alloc(sizeof(*mfsp), KM_SLEEP); + rootvp->v_data = mfsp; + rootvp->v_op = mfs_vnodeop_p; + rootvp->v_tag = VT_MFS; + mfsp->mfs_baseoff = mfs_rootbase; + mfsp->mfs_size = mfs_rootsize; + mfsp->mfs_vnode = rootvp; + mfsp->mfs_proc = NULL; /* indicate kernel space */ + mfsp->mfs_shutdown = 0; + cv_init(&mfsp->mfs_cv, "mfs"); + mfsp->mfs_refcnt = 1; + bufq_alloc(&mfsp->mfs_buflist, "fcfs", 0); + if ((error = ffs_mountfs(rootvp, mp, l)) != 0) { + vfs_unbusy(mp, false, NULL); + bufq_free(mfsp->mfs_buflist); + vfs_destroy(mp); + kmem_free(mfsp, sizeof(*mfsp)); + return (error); + } + mutex_enter(&mountlist_lock); + CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mutex_exit(&mountlist_lock); + mp->mnt_vnodecovered = NULLVP; + ump = VFSTOUFS(mp); + fs = ump->um_fs; + (void) copystr(mp->mnt_stat.f_mntonname, fs->fs_fsmnt, MNAMELEN - 1, 0); + (void)ffs_statvfs(mp, &mp->mnt_stat); + vfs_unbusy(mp, false, NULL); + return (0); +} + +/* + * VFS Operations. + * + * mount system call + */ +/* ARGSUSED */ +int +mfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) +{ + struct lwp *l = curlwp; + struct vnode *devvp; + struct mfs_args *args = data; + struct ufsmount *ump; + struct fs *fs; + struct mfsnode *mfsp; + struct proc *p; + int flags, error = 0; + + if (*data_len < sizeof *args) + return EINVAL; + + p = l->l_proc; + if (mp->mnt_flag & MNT_GETARGS) { + struct vnode *vp; + + ump = VFSTOUFS(mp); + if (ump == NULL) + return EIO; + + vp = ump->um_devvp; + if (vp == NULL) + return EIO; + + mfsp = VTOMFS(vp); + if (mfsp == NULL) + return EIO; + + args->fspec = NULL; + args->base = mfsp->mfs_baseoff; + args->size = mfsp->mfs_size; + *data_len = sizeof *args; + return 0; + } + /* + * XXX turn off async to avoid hangs when writing lots of data. + * the problem is that MFS needs to allocate pages to clean pages, + * so if we wait until the last minute to clean pages then there + * may not be any pages available to do the cleaning. + * ... and since the default partially-synchronous mode turns out + * to not be sufficient under heavy load, make it full synchronous. + */ + mp->mnt_flag &= ~MNT_ASYNC; + mp->mnt_flag |= MNT_SYNCHRONOUS; + + /* + * If updating, check whether changing from read-only to + * read/write; if there is no device name, that's all we do. 
+ */ + if (mp->mnt_flag & MNT_UPDATE) { + ump = VFSTOUFS(mp); + fs = ump->um_fs; + if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) { + flags = WRITECLOSE; + if (mp->mnt_flag & MNT_FORCE) + flags |= FORCECLOSE; + error = ffs_flushfiles(mp, flags, l); + if (error) + return (error); + } + if (fs->fs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR)) + fs->fs_ronly = 0; + if (args->fspec == NULL) + return EINVAL; + return (0); + } + error = getnewvnode(VT_MFS, NULL, mfs_vnodeop_p, NULL, &devvp); + if (error) + return (error); + devvp->v_vflag |= VV_MPSAFE; + devvp->v_type = VBLK; + spec_node_init(devvp, makedev(255, mfs_minor)); + mfs_minor++; + mfsp = kmem_alloc(sizeof(*mfsp), KM_SLEEP); + devvp->v_data = mfsp; + mfsp->mfs_baseoff = args->base; + mfsp->mfs_size = args->size; + mfsp->mfs_vnode = devvp; + mfsp->mfs_proc = p; + mfsp->mfs_shutdown = 0; + cv_init(&mfsp->mfs_cv, "mfsidl"); + mfsp->mfs_refcnt = 1; + bufq_alloc(&mfsp->mfs_buflist, "fcfs", 0); + if ((error = ffs_mountfs(devvp, mp, l)) != 0) { + mfsp->mfs_shutdown = 1; + vrele(devvp); + return (error); + } + ump = VFSTOUFS(mp); + fs = ump->um_fs; + error = set_statvfs_info(path, UIO_USERSPACE, args->fspec, + UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l); + if (error) + return error; + (void)strncpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, + sizeof(fs->fs_fsmnt)); + fs->fs_fsmnt[sizeof(fs->fs_fsmnt) - 1] = '\0'; + /* XXX: cleanup on error */ + return 0; +} + +/* + * Used to grab the process and keep it in the kernel to service + * memory filesystem I/O requests. + * + * Loop servicing I/O requests. + * Copy the requested data into or out of the memory filesystem + * address space. + */ +/* ARGSUSED */ +int +mfs_start(struct mount *mp, int flags) +{ + struct vnode *vp; + struct mfsnode *mfsp; + struct proc *p; + struct buf *bp; + void *base; + int sleepreturn = 0, refcnt, error; + ksiginfoq_t kq; + + /* + * Ensure that file system is still mounted when getting mfsnode. + * Add a reference to the mfsnode to prevent it disappearing in + * this routine. + */ + if ((error = vfs_busy(mp, NULL)) != 0) + return error; + vp = VFSTOUFS(mp)->um_devvp; + mfsp = VTOMFS(vp); + mutex_enter(&mfs_lock); + mfsp->mfs_refcnt++; + mutex_exit(&mfs_lock); + vfs_unbusy(mp, false, NULL); + + base = mfsp->mfs_baseoff; + mutex_enter(&mfs_lock); + while (mfsp->mfs_shutdown != 1) { + while ((bp = bufq_get(mfsp->mfs_buflist)) != NULL) { + mutex_exit(&mfs_lock); + mfs_doio(bp, base); + mutex_enter(&mfs_lock); + } + /* + * If a non-ignored signal is received, try to unmount. + * If that fails, or the filesystem is already in the + * process of being unmounted, clear the signal (it has been + * "processed"), otherwise we will loop here, as tsleep + * will always return EINTR/ERESTART. + */ + if (sleepreturn != 0) { + mutex_exit(&mfs_lock); + if (dounmount(mp, 0, curlwp) != 0) { + p = curproc; + ksiginfo_queue_init(&kq); + mutex_enter(p->p_lock); + sigclearall(p, NULL, &kq); + mutex_exit(p->p_lock); + ksiginfo_queue_drain(&kq); + } + sleepreturn = 0; + mutex_enter(&mfs_lock); + continue; + } + + sleepreturn = cv_wait_sig(&mfsp->mfs_cv, &mfs_lock); + } + KASSERT(bufq_peek(mfsp->mfs_buflist) == NULL); + refcnt = --mfsp->mfs_refcnt; + mutex_exit(&mfs_lock); + if (refcnt == 0) { + bufq_free(mfsp->mfs_buflist); + cv_destroy(&mfsp->mfs_cv); + kmem_free(mfsp, sizeof(*mfsp)); + } + return (sleepreturn); +} + +/* + * Get file system statistics. 
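Both mfs_mount() above (for fs_fsmnt) and mfs_statvfs() below (for f_fstypename) copy names with strncpy() and then force NUL termination, because strncpy() does not terminate the buffer when the source fills it. A minimal stand-alone sketch of that idiom:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char fstypename[8];
	const char *name = "mfs";

	(void)strncpy(fstypename, name, sizeof(fstypename));
	fstypename[sizeof(fstypename) - 1] = '\0';	/* always terminate */

	printf("%s\n", fstypename);
	return 0;
}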
+ */ +int +mfs_statvfs(struct mount *mp, struct statvfs *sbp) +{ + int error; + + error = ffs_statvfs(mp, sbp); + if (error) + return error; + (void)strncpy(sbp->f_fstypename, mp->mnt_op->vfs_name, + sizeof(sbp->f_fstypename)); + sbp->f_fstypename[sizeof(sbp->f_fstypename) - 1] = '\0'; + return 0; +} diff --git a/sys/ufs/mfs/mfs_vnops.c b/sys/ufs/mfs/mfs_vnops.c new file mode 100644 index 000000000..53a2c5874 --- /dev/null +++ b/sys/ufs/mfs/mfs_vnops.c @@ -0,0 +1,327 @@ +/* $NetBSD: mfs_vnops.c,v 1.54 2010/06/24 13:03:19 hannken Exp $ */ + +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)mfs_vnops.c 8.11 (Berkeley) 5/22/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: mfs_vnops.c,v 1.54 2010/06/24 13:03:19 hannken Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include + +/* + * mfs vnode operations. 
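The vnode operations vector that follows pairs each operation descriptor with a handler and falls back to vn_default_error for anything not listed. A user-space sketch of the same descriptor/handler dispatch idea, with hypothetical names and a string key in place of the vop_*_desc pointers:

#include <stdio.h>
#include <string.h>

typedef int (*op_fn)(void *);

struct op_entry {
	const char *name;	/* cf. vop_*_desc */
	op_fn fn;		/* handler, cf. mfs_* / spec_* / genfs_* */
};

static int default_error(void *v) { (void)v; return -1; }
static int my_open(void *v)       { (void)v; return 0; }
static int my_strategy(void *v)   { (void)v; return 0; }

static const struct op_entry ops[] = {
	{ "open",     my_open },
	{ "strategy", my_strategy },
	{ NULL,       NULL }
};

static int
dispatch(const char *name, void *arg)
{
	for (const struct op_entry *e = ops; e->name != NULL; e++)
		if (strcmp(e->name, name) == 0)
			return e->fn(arg);
	return default_error(arg);	/* cf. vop_default_desc */
}

int
main(void)
{
	printf("open -> %d, lookup -> %d\n",
	    dispatch("open", NULL), dispatch("lookup", NULL));
	return 0;
}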
+ */ +int (**mfs_vnodeop_p)(void *); +const struct vnodeopv_entry_desc mfs_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, mfs_lookup }, /* lookup */ + { &vop_create_desc, mfs_create }, /* create */ + { &vop_mknod_desc, mfs_mknod }, /* mknod */ + { &vop_open_desc, mfs_open }, /* open */ + { &vop_close_desc, mfs_close }, /* close */ + { &vop_access_desc, mfs_access }, /* access */ + { &vop_getattr_desc, mfs_getattr }, /* getattr */ + { &vop_setattr_desc, mfs_setattr }, /* setattr */ + { &vop_read_desc, mfs_read }, /* read */ + { &vop_write_desc, mfs_write }, /* write */ + { &vop_ioctl_desc, mfs_ioctl }, /* ioctl */ + { &vop_poll_desc, mfs_poll }, /* poll */ + { &vop_revoke_desc, mfs_revoke }, /* revoke */ + { &vop_mmap_desc, mfs_mmap }, /* mmap */ + { &vop_fsync_desc, spec_fsync }, /* fsync */ + { &vop_seek_desc, mfs_seek }, /* seek */ + { &vop_remove_desc, mfs_remove }, /* remove */ + { &vop_link_desc, mfs_link }, /* link */ + { &vop_rename_desc, mfs_rename }, /* rename */ + { &vop_mkdir_desc, mfs_mkdir }, /* mkdir */ + { &vop_rmdir_desc, mfs_rmdir }, /* rmdir */ + { &vop_symlink_desc, mfs_symlink }, /* symlink */ + { &vop_readdir_desc, mfs_readdir }, /* readdir */ + { &vop_readlink_desc, mfs_readlink }, /* readlink */ + { &vop_abortop_desc, mfs_abortop }, /* abortop */ + { &vop_inactive_desc, mfs_inactive }, /* inactive */ + { &vop_reclaim_desc, mfs_reclaim }, /* reclaim */ + { &vop_lock_desc, genfs_nolock }, /* lock */ + { &vop_unlock_desc, genfs_nounlock }, /* unlock */ + { &vop_bmap_desc, mfs_bmap }, /* bmap */ + { &vop_strategy_desc, mfs_strategy }, /* strategy */ + { &vop_print_desc, mfs_print }, /* print */ + { &vop_islocked_desc, mfs_islocked }, /* islocked */ + { &vop_pathconf_desc, mfs_pathconf }, /* pathconf */ + { &vop_advlock_desc, mfs_advlock }, /* advlock */ + { &vop_bwrite_desc, mfs_bwrite }, /* bwrite */ + { &vop_putpages_desc, mfs_putpages }, /* putpages */ + { NULL, NULL } +}; +const struct vnodeopv_desc mfs_vnodeop_opv_desc = + { &mfs_vnodeop_p, mfs_vnodeop_entries }; + +/* + * Vnode Operations. + * + * Open called to allow memory filesystem to initialize and + * validate before actual IO. Record our process identifier + * so we can tell when we are doing I/O to ourself. + */ +/* ARGSUSED */ +int +mfs_open(void *v) +{ + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + kauth_cred_t a_cred; + } */ *ap = v; + + if (ap->a_vp->v_type != VBLK) { + panic("mfs_ioctl not VBLK"); + /* NOTREACHED */ + } + return (0); +} + +/* + * Pass I/O requests to the memory filesystem process. + */ +int +mfs_strategy(void *v) +{ + struct vop_strategy_args /* { + struct vnode *a_vp; + struct buf *a_bp; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct buf *bp = ap->a_bp; + struct mfsnode *mfsp; + + if (vp->v_type != VBLK || vp->v_usecount == 0) + panic("mfs_strategy: bad dev"); + mfsp = VTOMFS(vp); + /* check for mini-root access */ + if (mfsp->mfs_proc == NULL) { + void *base; + + base = (char *)mfsp->mfs_baseoff + (bp->b_blkno << DEV_BSHIFT); + if (bp->b_flags & B_READ) + memcpy(bp->b_data, base, bp->b_bcount); + else + memcpy(base, bp->b_data, bp->b_bcount); + bp->b_resid = 0; + biodone(bp); + } else if (mfsp->mfs_proc == curproc) { + mfs_doio(bp, mfsp->mfs_baseoff); + } else if (doing_shutdown) { + /* + * bitbucket I/O during shutdown. + * Note that reads should *not* happen here, but.. 
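mfs_strategy() above (and mfs_doio(), which follows) satisfy block I/O by copying to or from the backing memory at byte offset b_blkno << DEV_BSHIFT, i.e. 512-byte device blocks. A user-space sketch of that copy-at-offset scheme, with hypothetical names:

#include <stdint.h>
#include <string.h>

#define DEV_BSHIFT	9	/* 512-byte device blocks, as in the kernel */

/* Copy one request to/from a RAM-backed "disk" starting at base. */
static void
ramdisk_io(void *base, uint64_t blkno, void *data, size_t bcount, int is_read)
{
	char *p = (char *)base + (blkno << DEV_BSHIFT);

	if (is_read)
		memcpy(data, p, bcount);	/* cf. the B_READ branch */
	else
		memcpy(p, data, bcount);
}

int
main(void)
{
	static char disk[64 * 512];
	char buf[512] = "hello";
	char out[512];

	ramdisk_io(disk, 3, buf, sizeof(buf), 0);	/* write block 3 */
	ramdisk_io(disk, 3, out, sizeof(out), 1);	/* read it back */
	return !(memcmp(buf, out, sizeof(buf)) == 0);
}

The kernel version additionally distinguishes the mini-root case (plain memcpy) from mounts backed by a user process (copyin/copyout), but the offset arithmetic is the same.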
+ */ + if (bp->b_flags & B_READ) + printf("warning: mfs read during shutdown\n"); + bp->b_resid = 0; + biodone(bp); + } else { + mutex_enter(&mfs_lock); + bufq_put(mfsp->mfs_buflist, bp); + cv_broadcast(&mfsp->mfs_cv); + mutex_exit(&mfs_lock); + } + return (0); +} + +/* + * Memory file system I/O. + */ +void +mfs_doio(struct buf *bp, void *base) +{ + + base = (char *)base + (bp->b_blkno << DEV_BSHIFT); + if (bp->b_flags & B_READ) + bp->b_error = copyin(base, bp->b_data, bp->b_bcount); + else + bp->b_error = copyout(bp->b_data, base, bp->b_bcount); + if (bp->b_error == 0) + bp->b_resid = 0; + biodone(bp); +} + +/* + * This is a noop, simply returning what one has been given. + */ +int +mfs_bmap(void *v) +{ + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct vnode **a_vpp; + daddr_t *a_bnp; + int *a_runp; + } */ *ap = v; + + if (ap->a_vpp != NULL) + *ap->a_vpp = ap->a_vp; + if (ap->a_bnp != NULL) + *ap->a_bnp = ap->a_bn; + if (ap->a_runp != NULL) + *ap->a_runp = 0; + return (0); +} + +/* + * Memory filesystem close routine + */ +/* ARGSUSED */ +int +mfs_close(void *v) +{ + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct mfsnode *mfsp = VTOMFS(vp); + struct buf *bp; + int error; + + /* + * Finish any pending I/O requests. + */ + mutex_enter(&mfs_lock); + while ((bp = bufq_get(mfsp->mfs_buflist)) != NULL) { + mutex_exit(&mfs_lock); + mfs_doio(bp, mfsp->mfs_baseoff); + mutex_enter(&mfs_lock); + } + mutex_exit(&mfs_lock); + /* + * On last close of a memory filesystem + * we must invalidate any in core blocks, so that + * we can, free up its vnode. + */ + if ((error = vinvalbuf(vp, V_SAVE, ap->a_cred, curlwp, 0, 0)) != 0) + return (error); + /* + * There should be no way to have any more uses of this + * vnode, so if we find any other uses, it is a panic. + */ + if (bufq_peek(mfsp->mfs_buflist) != NULL) + panic("mfs_close"); + /* + * Send a request to the filesystem server to exit. + */ + mutex_enter(&mfs_lock); + mfsp->mfs_shutdown = 1; + cv_broadcast(&mfsp->mfs_cv); + mutex_exit(&mfs_lock); + return (0); +} + +/* + * Memory filesystem inactive routine + */ +/* ARGSUSED */ +int +mfs_inactive(void *v) +{ + struct vop_inactive_args /* { + struct vnode *a_vp; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct mfsnode *mfsp = VTOMFS(vp); + + if (bufq_peek(mfsp->mfs_buflist) != NULL) + panic("mfs_inactive: not inactive (mfs_buflist %p)", + bufq_peek(mfsp->mfs_buflist)); + VOP_UNLOCK(vp); + return (0); +} + +/* + * Reclaim a memory filesystem devvp so that it can be reused. + */ +int +mfs_reclaim(void *v) +{ + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct mfsnode *mfsp = VTOMFS(vp); + int refcnt; + + mutex_enter(&mfs_lock); + vp->v_data = NULL; + refcnt = --mfsp->mfs_refcnt; + mutex_exit(&mfs_lock); + + if (refcnt == 0) { + bufq_free(mfsp->mfs_buflist); + cv_destroy(&mfsp->mfs_cv); + kmem_free(mfsp, sizeof(*mfsp)); + } + + return (0); +} + +/* + * Print out the contents of an mfsnode. + */ +int +mfs_print(void *v) +{ + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap = v; + struct mfsnode *mfsp = VTOMFS(ap->a_vp); + + printf("tag VT_MFS, pid %d, base %p, size %ld\n", + (mfsp->mfs_proc != NULL) ? 
mfsp->mfs_proc->p_pid : 0, + mfsp->mfs_baseoff, mfsp->mfs_size); + return (0); +} diff --git a/include/ufs/mfs/mfsnode.h b/sys/ufs/mfs/mfsnode.h similarity index 100% rename from include/ufs/mfs/mfsnode.h rename to sys/ufs/mfs/mfsnode.h diff --git a/sys/ufs/ufs/Makefile b/sys/ufs/ufs/Makefile new file mode 100644 index 000000000..6f08db609 --- /dev/null +++ b/sys/ufs/ufs/Makefile @@ -0,0 +1,8 @@ +# $NetBSD: Makefile,v 1.7 2011/03/06 17:08:39 bouyer Exp $ + +INCSDIR= /usr/include/ufs/ufs + +INCS= dinode.h dir.h extattr.h inode.h quota.h quota1.h quota2.h \ + ufs_bswap.h ufs_extern.h ufs_wapbl.h ufsmount.h + +.include diff --git a/include/ufs/ufs/dinode.h b/sys/ufs/ufs/dinode.h similarity index 100% rename from include/ufs/ufs/dinode.h rename to sys/ufs/ufs/dinode.h diff --git a/include/ufs/ufs/dir.h b/sys/ufs/ufs/dir.h similarity index 100% rename from include/ufs/ufs/dir.h rename to sys/ufs/ufs/dir.h diff --git a/include/ufs/ufs/dirhash.h b/sys/ufs/ufs/dirhash.h similarity index 100% rename from include/ufs/ufs/dirhash.h rename to sys/ufs/ufs/dirhash.h diff --git a/include/ufs/ufs/extattr.h b/sys/ufs/ufs/extattr.h similarity index 100% rename from include/ufs/ufs/extattr.h rename to sys/ufs/ufs/extattr.h diff --git a/include/ufs/ufs/inode.h b/sys/ufs/ufs/inode.h similarity index 100% rename from include/ufs/ufs/inode.h rename to sys/ufs/ufs/inode.h diff --git a/include/ufs/ufs/quota.h b/sys/ufs/ufs/quota.h similarity index 100% rename from include/ufs/ufs/quota.h rename to sys/ufs/ufs/quota.h diff --git a/include/ufs/ufs/quota1.h b/sys/ufs/ufs/quota1.h similarity index 100% rename from include/ufs/ufs/quota1.h rename to sys/ufs/ufs/quota1.h diff --git a/sys/ufs/ufs/quota1_subr.c b/sys/ufs/ufs/quota1_subr.c new file mode 100644 index 000000000..ff6a06c92 --- /dev/null +++ b/sys/ufs/ufs/quota1_subr.c @@ -0,0 +1,95 @@ +/* $NetBSD: quota1_subr.c,v 1.6 2011/11/25 16:55:05 dholland Exp $ */ +/*- + * Copyright (c) 2010 Manuel Bouyer + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
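The quota1_subr.c conversion routines that follow map between two limit encodings: in the legacy dqblk fields a value of 0 means "no limit" and a non-zero value is one greater than the corresponding quotaval limit, while in struct quotaval UQUAD_MAX means "no limit". A small stand-alone sketch of that mapping and its round trip, with hypothetical function names:

#include <stdint.h>
#include <stdio.h>

#define UQUAD_MAX	UINT64_MAX	/* "no limit" in the quotaval encoding */

/* dqblk encoding: 0 == unlimited, otherwise one past the quotaval limit. */
static uint64_t
dqblk_limit_to_quotaval(uint32_t lim)
{
	return (lim == 0) ? UQUAD_MAX : (uint64_t)lim - 1;
}

static uint32_t
quotaval_limit_to_dqblk(uint64_t lim)
{
	return (lim == UQUAD_MAX) ? 0 : (uint32_t)(lim + 1);
}

int
main(void)
{
	uint32_t disk = 1000;	/* hypothetical on-disk block hard limit */
	uint64_t qv = dqblk_limit_to_quotaval(disk);

	printf("dqblk %u -> quotaval %llu -> dqblk %u\n",
	    disk, (unsigned long long)qv, quotaval_limit_to_dqblk(qv));
	printf("dqblk 0 (unlimited) -> quotaval %#llx\n",
	    (unsigned long long)dqblk_limit_to_quotaval(0));
	return 0;
}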
+ */ + +#include +__KERNEL_RCSID(0, "$NetBSD: quota1_subr.c,v 1.6 2011/11/25 16:55:05 dholland Exp $"); + +#include +#include + +#include +#include +#include + +static uint64_t +dqblk2q2e_limit(uint32_t lim) +{ + if (lim == 0) + return UQUAD_MAX; + else + return (lim - 1); +} + +static uint32_t +q2e2dqblk_limit(uint64_t lim) +{ + if (lim == UQUAD_MAX) + return 0; + else + return (lim + 1); +} + +void +dqblk_to_quotaval(const struct dqblk *dqblk, struct quotaval *qv) +{ + /* XXX is qv_grace getting handled correctly? */ + + qv[QUOTA_LIMIT_BLOCK].qv_hardlimit = + dqblk2q2e_limit(dqblk->dqb_bhardlimit); + qv[QUOTA_LIMIT_BLOCK].qv_softlimit = + dqblk2q2e_limit(dqblk->dqb_bsoftlimit); + qv[QUOTA_LIMIT_BLOCK].qv_usage = dqblk->dqb_curblocks; + qv[QUOTA_LIMIT_BLOCK].qv_expiretime = dqblk->dqb_btime; + + qv[QUOTA_LIMIT_FILE].qv_hardlimit = + dqblk2q2e_limit(dqblk->dqb_ihardlimit); + qv[QUOTA_LIMIT_FILE].qv_softlimit = + dqblk2q2e_limit(dqblk->dqb_isoftlimit); + qv[QUOTA_LIMIT_FILE].qv_usage = dqblk->dqb_curinodes; + qv[QUOTA_LIMIT_FILE].qv_expiretime = dqblk->dqb_itime; +} + +void +quotaval_to_dqblk(const struct quotaval *qv, struct dqblk *dqblk) +{ + /* XXX is qv_grace getting handled correctly? */ + + dqblk->dqb_bhardlimit = + q2e2dqblk_limit(qv[QUOTA_LIMIT_BLOCK].qv_hardlimit); + dqblk->dqb_bsoftlimit = + q2e2dqblk_limit(qv[QUOTA_LIMIT_BLOCK].qv_softlimit); + dqblk->dqb_curblocks = qv[QUOTA_LIMIT_BLOCK].qv_usage; + dqblk->dqb_btime = qv[QUOTA_LIMIT_BLOCK].qv_expiretime; + + dqblk->dqb_ihardlimit = + q2e2dqblk_limit(qv[QUOTA_LIMIT_FILE].qv_hardlimit); + dqblk->dqb_isoftlimit = + q2e2dqblk_limit(qv[QUOTA_LIMIT_FILE].qv_softlimit); + dqblk->dqb_curinodes = qv[QUOTA_LIMIT_FILE].qv_usage; + dqblk->dqb_itime = qv[QUOTA_LIMIT_FILE].qv_expiretime; +} + diff --git a/include/ufs/ufs/quota2.h b/sys/ufs/ufs/quota2.h similarity index 100% rename from include/ufs/ufs/quota2.h rename to sys/ufs/ufs/quota2.h diff --git a/sys/ufs/ufs/quota2_subr.c b/sys/ufs/ufs/quota2_subr.c new file mode 100644 index 000000000..f91007f1b --- /dev/null +++ b/sys/ufs/ufs/quota2_subr.c @@ -0,0 +1,108 @@ +/* $NetBSD: quota2_subr.c,v 1.4 2011/06/07 14:56:13 bouyer Exp $ */ +/*- + * Copyright (c) 2010 Manuel Bouyer + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
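quota2_subr.c below stores every on-disk field through ufs_rw16/32/64, which byte-swap only when the filesystem's byte order differs from the host's (the needswap/ns argument). A minimal sketch of that conditional-swap idea, using the GCC/Clang builtin rather than the kernel's bswap macros; the names are illustrative only:

#include <stdint.h>
#include <stdio.h>

/* Swap only when the on-disk byte order differs from the host's. */
static uint64_t
rw64(uint64_t v, int needswap)
{
	return needswap ? __builtin_bswap64(v) : v;
}

int
main(void)
{
	uint64_t grace = 7ULL * 24 * 3600;	/* 7 days, as in the default entry */

	printf("native:  %#llx\n", (unsigned long long)rw64(grace, 0));
	printf("swapped: %#llx\n", (unsigned long long)rw64(grace, 1));
	return 0;
}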
+ */ + +#include +__KERNEL_RCSID(0, "$NetBSD: quota2_subr.c,v 1.4 2011/06/07 14:56:13 bouyer Exp $"); + +#include +#include + +#include +#include +#include +#include +#include + +#ifndef _KERNEL +#include +#endif + +void +quota2_addfreeq2e(struct quota2_header *q2h, void *bp, uint64_t baseoff, + uint64_t bsize, int ns) +{ + uint64_t blkoff = baseoff % bsize; + int i, nq2e; + struct quota2_entry *q2e; + + q2e = (void *)((char *)bp + blkoff); + nq2e = (bsize - blkoff) / sizeof(*q2e); + for (i = 0; i < nq2e; i++) { + q2e[i].q2e_next = q2h->q2h_free; + q2h->q2h_free = ufs_rw64(i * sizeof(*q2e) + baseoff, ns); + } +} + +void +quota2_create_blk0(uint64_t bsize, void *bp, int q2h_hash_shift, int type, + int ns) +{ + struct quota2_header *q2h; + const int quota2_hash_size = 1 << q2h_hash_shift; + const int quota2_full_header_size = sizeof(struct quota2_header) + + sizeof(q2h->q2h_entries[0]) * quota2_hash_size; + int i; + + memset(bp, 0, bsize); + q2h = bp; + q2h->q2h_magic_number = ufs_rw32(Q2_HEAD_MAGIC, ns); + q2h->q2h_type = type; + q2h->q2h_hash_shift = q2h_hash_shift; + q2h->q2h_hash_size = ufs_rw16(quota2_hash_size, ns); + /* setup defaut entry: unlimited, 7 days grace */ + for (i = 0; i < N_QL; i++) { + q2h->q2h_defentry.q2e_val[i].q2v_hardlimit = + q2h->q2h_defentry.q2e_val[i].q2v_softlimit = + ufs_rw64(UQUAD_MAX, ns); + q2h->q2h_defentry.q2e_val[i].q2v_grace = + ufs_rw64(7ULL * 24ULL * 3600ULL, ns); + } + + /* first quota entry, after the hash table */ + quota2_addfreeq2e(q2h, bp, quota2_full_header_size, bsize, ns); +} + +void +quota2_ufs_rwq2v(const struct quota2_val *s, struct quota2_val *d, int needswap) +{ + d->q2v_hardlimit = ufs_rw64(s->q2v_hardlimit, needswap); + d->q2v_softlimit = ufs_rw64(s->q2v_softlimit, needswap); + d->q2v_cur = ufs_rw64(s->q2v_cur, needswap); + d->q2v_time = ufs_rw64(s->q2v_time, needswap); + d->q2v_grace = ufs_rw64(s->q2v_grace, needswap); +} + +void +quota2_ufs_rwq2e(const struct quota2_entry *s, struct quota2_entry *d, +int needswap) +{ + quota2_ufs_rwq2v(&s->q2e_val[QL_BLOCK], &d->q2e_val[QL_BLOCK], + needswap); + quota2_ufs_rwq2v(&s->q2e_val[QL_FILE], &d->q2e_val[QL_FILE], + needswap); + d->q2e_uid = ufs_rw32(s->q2e_uid, needswap); +} diff --git a/sys/ufs/ufs/ufs_bmap.c b/sys/ufs/ufs/ufs_bmap.c new file mode 100644 index 000000000..3420e227a --- /dev/null +++ b/sys/ufs/ufs/ufs_bmap.c @@ -0,0 +1,405 @@ +/* $NetBSD: ufs_bmap.c,v 1.49 2011/03/06 17:08:39 bouyer Exp $ */ + +/* + * Copyright (c) 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_bmap.c 8.8 (Berkeley) 8/11/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ufs_bmap.c,v 1.49 2011/03/06 17:08:39 bouyer Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +static bool +ufs_issequential(const struct ufsmount *ump, daddr_t daddr0, daddr_t daddr1) +{ + + /* for ufs, blocks in a hole is not 'contiguous'. */ + if (daddr0 == 0) + return false; + + return (daddr0 + ump->um_seqinc == daddr1); +} + +/* + * Bmap converts the logical block number of a file to its physical block + * number on the disk. The conversion is done by using the logical block + * number to index into the array of block pointers described by the dinode. + */ +int +ufs_bmap(void *v) +{ + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct vnode **a_vpp; + daddr_t *a_bnp; + int *a_runp; + } */ *ap = v; + int error; + + /* + * Check for underlying vnode requests and ensure that logical + * to physical mapping is requested. + */ + if (ap->a_vpp != NULL) + *ap->a_vpp = VTOI(ap->a_vp)->i_devvp; + if (ap->a_bnp == NULL) + return (0); + + fstrans_start(ap->a_vp->v_mount, FSTRANS_SHARED); + error = ufs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, NULL, NULL, + ap->a_runp, ufs_issequential); + fstrans_done(ap->a_vp->v_mount); + return error; +} + +/* + * Indirect blocks are now on the vnode for the file. They are given negative + * logical block numbers. Indirect blocks are addressed by the negative + * address of the first data block to which they point. Double indirect blocks + * are addressed by one less than the address of the first indirect block to + * which they point. Triple indirect blocks are addressed by one less than + * the address of the first double indirect block to which they point. + * + * ufs_bmaparray does the bmap conversion, and if requested returns the + * array of logical blocks which must be traversed to get to a block. + * Each entry contains the offset into that block that gets you to the + * next block and the disk address of the block (if it is assigned). 
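The comment above explains how blocks past the NDADDR direct pointers are reached through one, two or three levels of indirect blocks, and ufs_getlbns() further below finds the level by repeatedly multiplying by the number of pointers an indirect block holds. A compact sketch of that level calculation under assumed parameters (NDADDR = 12, NIADDR = 3, and e.g. 2048 pointers per indirect block for 8 KB UFS1 blocks):

#include <stdio.h>

#define NDADDR	12	/* direct block pointers in the inode */
#define NIADDR	3	/* indirect pointers: single, double, triple */

/*
 * Return the number of indirection levels (1..NIADDR) needed to reach
 * logical block "bn", -1 if it is a direct block, or 0 if it lies beyond
 * the triple-indirect range (the EFBIG case in ufs_getlbns()).
 * "nindir" is the number of block pointers one indirect block can hold.
 */
static int
indir_levels(long long bn, long long nindir)
{
	long long blockcnt = 1;

	if (bn < NDADDR)
		return -1;			/* direct block */
	bn -= NDADDR;
	for (int i = 1; i <= NIADDR; i++) {
		blockcnt *= nindir;		/* blocks reachable at this level */
		if (bn < blockcnt)
			return i;
		bn -= blockcnt;
	}
	return 0;
}

int
main(void)
{
	printf("%d %d %d\n", indir_levels(5, 2048),
	    indir_levels(100, 2048), indir_levels(3000000, 2048));
	return 0;
}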
+ */ + +int +ufs_bmaparray(struct vnode *vp, daddr_t bn, daddr_t *bnp, struct indir *ap, + int *nump, int *runp, ufs_issequential_callback_t is_sequential) +{ + struct inode *ip; + struct buf *bp, *cbp; + struct ufsmount *ump; + struct mount *mp; + struct indir a[NIADDR + 1], *xap; + daddr_t daddr; + daddr_t metalbn; + int error, maxrun = 0, num; + + ip = VTOI(vp); + mp = vp->v_mount; + ump = ip->i_ump; +#ifdef DIAGNOSTIC + if ((ap != NULL && nump == NULL) || (ap == NULL && nump != NULL)) + panic("ufs_bmaparray: invalid arguments"); +#endif + + if (runp) { + /* + * XXX + * If MAXBSIZE is the largest transfer the disks can handle, + * we probably want maxrun to be 1 block less so that we + * don't create a block larger than the device can handle. + */ + *runp = 0; + maxrun = MAXPHYS / mp->mnt_stat.f_iosize - 1; + } + + if (bn >= 0 && bn < NDADDR) { + if (nump != NULL) + *nump = 0; + if (ump->um_fstype == UFS1) + daddr = ufs_rw32(ip->i_ffs1_db[bn], + UFS_MPNEEDSWAP(ump)); + else + daddr = ufs_rw64(ip->i_ffs2_db[bn], + UFS_MPNEEDSWAP(ump)); + *bnp = blkptrtodb(ump, daddr); + /* + * Since this is FFS independent code, we are out of + * scope for the definitions of BLK_NOCOPY and + * BLK_SNAP, but we do know that they will fall in + * the range 1..um_seqinc, so we use that test and + * return a request for a zeroed out buffer if attempts + * are made to read a BLK_NOCOPY or BLK_SNAP block. + */ + if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT + && daddr > 0 && + daddr < ump->um_seqinc) { + *bnp = -1; + } else if (*bnp == 0) { + if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) + == SF_SNAPSHOT) { + *bnp = blkptrtodb(ump, bn * ump->um_seqinc); + } else { + *bnp = -1; + } + } else if (runp) { + if (ump->um_fstype == UFS1) { + for (++bn; bn < NDADDR && *runp < maxrun && + is_sequential(ump, + ufs_rw32(ip->i_ffs1_db[bn - 1], + UFS_MPNEEDSWAP(ump)), + ufs_rw32(ip->i_ffs1_db[bn], + UFS_MPNEEDSWAP(ump))); + ++bn, ++*runp); + } else { + for (++bn; bn < NDADDR && *runp < maxrun && + is_sequential(ump, + ufs_rw64(ip->i_ffs2_db[bn - 1], + UFS_MPNEEDSWAP(ump)), + ufs_rw64(ip->i_ffs2_db[bn], + UFS_MPNEEDSWAP(ump))); + ++bn, ++*runp); + } + } + return (0); + } + + xap = ap == NULL ? a : ap; + if (!nump) + nump = # + if ((error = ufs_getlbns(vp, bn, xap, nump)) != 0) + return (error); + + num = *nump; + + /* Get disk address out of indirect block array */ + if (ump->um_fstype == UFS1) + daddr = ufs_rw32(ip->i_ffs1_ib[xap->in_off], + UFS_MPNEEDSWAP(ump)); + else + daddr = ufs_rw64(ip->i_ffs2_ib[xap->in_off], + UFS_MPNEEDSWAP(ump)); + + for (bp = NULL, ++xap; --num; ++xap) { + /* + * Exit the loop if there is no disk address assigned yet and + * the indirect block isn't in the cache, or if we were + * looking for an indirect block and we've found it. + */ + + metalbn = xap->in_lbn; + if (metalbn == bn) + break; + if (daddr == 0) { + mutex_enter(&bufcache_lock); + cbp = incore(vp, metalbn); + mutex_exit(&bufcache_lock); + if (cbp == NULL) + break; + } + + /* + * If we get here, we've either got the block in the cache + * or we have a disk address for it, go fetch it. + */ + if (bp) + brelse(bp, 0); + + xap->in_exists = 1; + bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0); + if (bp == NULL) { + + /* + * getblk() above returns NULL only iff we are + * pagedaemon. See the implementation of getblk + * for detail. 
+ */ + + return (ENOMEM); + } + if (bp->b_oflags & (BO_DONE | BO_DELWRI)) { + trace(TR_BREADHIT, pack(vp, size), metalbn); + } +#ifdef DIAGNOSTIC + else if (!daddr) + panic("ufs_bmaparray: indirect block not in cache"); +#endif + else { + trace(TR_BREADMISS, pack(vp, size), metalbn); + bp->b_blkno = blkptrtodb(ump, daddr); + bp->b_flags |= B_READ; + BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); + VOP_STRATEGY(vp, bp); + curlwp->l_ru.ru_inblock++; /* XXX */ + if ((error = biowait(bp)) != 0) { + brelse(bp, 0); + return (error); + } + } + if (ump->um_fstype == UFS1) { + daddr = ufs_rw32(((u_int32_t *)bp->b_data)[xap->in_off], + UFS_MPNEEDSWAP(ump)); + if (num == 1 && daddr && runp) { + for (bn = xap->in_off + 1; + bn < MNINDIR(ump) && *runp < maxrun && + is_sequential(ump, + ufs_rw32(((int32_t *)bp->b_data)[bn-1], + UFS_MPNEEDSWAP(ump)), + ufs_rw32(((int32_t *)bp->b_data)[bn], + UFS_MPNEEDSWAP(ump))); + ++bn, ++*runp); + } + } else { + daddr = ufs_rw64(((u_int64_t *)bp->b_data)[xap->in_off], + UFS_MPNEEDSWAP(ump)); + if (num == 1 && daddr && runp) { + for (bn = xap->in_off + 1; + bn < MNINDIR(ump) && *runp < maxrun && + is_sequential(ump, + ufs_rw64(((int64_t *)bp->b_data)[bn-1], + UFS_MPNEEDSWAP(ump)), + ufs_rw64(((int64_t *)bp->b_data)[bn], + UFS_MPNEEDSWAP(ump))); + ++bn, ++*runp); + } + } + } + if (bp) + brelse(bp, 0); + + /* + * Since this is FFS independent code, we are out of scope for the + * definitions of BLK_NOCOPY and BLK_SNAP, but we do know that they + * will fall in the range 1..um_seqinc, so we use that test and + * return a request for a zeroed out buffer if attempts are made + * to read a BLK_NOCOPY or BLK_SNAP block. + */ + if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT + && daddr > 0 && daddr < ump->um_seqinc) { + *bnp = -1; + return (0); + } + *bnp = blkptrtodb(ump, daddr); + if (*bnp == 0) { + if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) + == SF_SNAPSHOT) { + *bnp = blkptrtodb(ump, bn * ump->um_seqinc); + } else { + *bnp = -1; + } + } + return (0); +} + +/* + * Create an array of logical block number/offset pairs which represent the + * path of indirect blocks required to access a data block. The first "pair" + * contains the logical block number of the appropriate single, double or + * triple indirect block and the offset into the inode indirect block array. + * Note, the logical block number of the inode single/double/triple indirect + * block appears twice in the array, once with the offset into the i_ffs1_ib and + * once with the offset into the page itself. + */ +int +ufs_getlbns(struct vnode *vp, daddr_t bn, struct indir *ap, int *nump) +{ + daddr_t metalbn, realbn; + struct ufsmount *ump; + int64_t blockcnt; + int lbc; + int i, numlevels, off; + + ump = VFSTOUFS(vp->v_mount); + if (nump) + *nump = 0; + numlevels = 0; + realbn = bn; + if (bn < 0) + bn = -bn; + KASSERT(bn >= NDADDR); + + /* + * Determine the number of levels of indirection. After this loop + * is done, blockcnt indicates the number of data blocks possible + * at the given level of indirection, and NIADDR - i is the number + * of levels of indirection needed to locate the requested block. + */ + + bn -= NDADDR; + for (lbc = 0, i = NIADDR;; i--, bn -= blockcnt) { + if (i == 0) + return (EFBIG); + + lbc += ump->um_lognindir; + blockcnt = (int64_t)1 << lbc; + + if (bn < blockcnt) + break; + } + + /* Calculate the address of the first meta-block. */ + metalbn = -((realbn >= 0 ? 
realbn : -realbn) - bn + NIADDR - i); + + /* + * At each iteration, off is the offset into the bap array which is + * an array of disk addresses at the current level of indirection. + * The logical block number and the offset in that block are stored + * into the argument array. + */ + ap->in_lbn = metalbn; + ap->in_off = off = NIADDR - i; + ap->in_exists = 0; + ap++; + for (++numlevels; i <= NIADDR; i++) { + /* If searching for a meta-data block, quit when found. */ + if (metalbn == realbn) + break; + + lbc -= ump->um_lognindir; + off = (bn >> lbc) & (MNINDIR(ump) - 1); + + ++numlevels; + ap->in_lbn = metalbn; + ap->in_off = off; + ap->in_exists = 0; + ++ap; + + metalbn -= -1 + ((int64_t)off << lbc); + } + if (nump) + *nump = numlevels; + return (0); +} diff --git a/include/ufs/ufs/ufs_bswap.h b/sys/ufs/ufs/ufs_bswap.h similarity index 100% rename from include/ufs/ufs/ufs_bswap.h rename to sys/ufs/ufs/ufs_bswap.h diff --git a/sys/ufs/ufs/ufs_dirhash.c b/sys/ufs/ufs/ufs_dirhash.c new file mode 100644 index 000000000..e893a93f0 --- /dev/null +++ b/sys/ufs/ufs/ufs_dirhash.c @@ -0,0 +1,1171 @@ +/* $NetBSD: ufs_dirhash.c,v 1.34 2009/10/05 23:48:08 rmind Exp $ */ + +/* + * Copyright (c) 2001, 2002 Ian Dowse. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/sys/ufs/ufs/ufs_dirhash.c,v 1.3.2.8 2004/12/08 11:54:13 dwmalone Exp $ + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ufs_dirhash.c,v 1.34 2009/10/05 23:48:08 rmind Exp $"); + +/* + * This implements a hash-based lookup scheme for UFS directories. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define WRAPINCR(val, limit) (((val) + 1 == (limit)) ? 0 : ((val) + 1)) +#define WRAPDECR(val, limit) (((val) == 0) ? ((limit) - 1) : ((val) - 1)) +#define OFSFMT(ip) ((ip)->i_ump->um_maxsymlinklen <= 0) +#define BLKFREE2IDX(n) ((n) > DH_NFSTATS ? 
DH_NFSTATS : (n)) + +static u_int ufs_dirhashminblks = 5; +static u_int ufs_dirhashmaxmem = 2 * 1024 * 1024; +static u_int ufs_dirhashmem; +static u_int ufs_dirhashcheck = 0; + +static int ufsdirhash_hash(struct dirhash *dh, const char *name, int namelen); +static void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff, + int dirblksiz); +static void ufsdirhash_delslot(struct dirhash *dh, int slot); +static int ufsdirhash_findslot(struct dirhash *dh, const char *name, + int namelen, doff_t offset); +static doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset, + int dirblksiz); +static int ufsdirhash_recycle(int wanted); + +static pool_cache_t ufsdirhashblk_cache; +static pool_cache_t ufsdirhash_cache; + +#define DIRHASHLIST_LOCK() mutex_enter(&ufsdirhash_lock) +#define DIRHASHLIST_UNLOCK() mutex_exit(&ufsdirhash_lock) +#define DIRHASH_LOCK(dh) mutex_enter(&(dh)->dh_lock) +#define DIRHASH_UNLOCK(dh) mutex_exit(&(dh)->dh_lock) +#define DIRHASH_BLKALLOC() \ + pool_cache_get(ufsdirhashblk_cache, PR_NOWAIT) +#define DIRHASH_BLKFREE(ptr) \ + pool_cache_put(ufsdirhashblk_cache, ptr) + +/* Dirhash list; recently-used entries are near the tail. */ +static TAILQ_HEAD(, dirhash) ufsdirhash_list; + +/* Protects: ufsdirhash_list, `dh_list' field, ufs_dirhashmem. */ +static kmutex_t ufsdirhash_lock; + +static struct sysctllog *ufsdirhash_sysctl_log; + +/* + * Locking order: + * ufsdirhash_lock + * dh_lock + * + * The dh_lock mutex should be acquired either via the inode lock, or via + * ufsdirhash_lock. Only the owner of the inode may free the associated + * dirhash, but anything can steal its memory and set dh_hash to NULL. + */ + +/* + * Attempt to build up a hash table for the directory contents in + * inode 'ip'. Returns 0 on success, or -1 of the operation failed. + */ +int +ufsdirhash_build(struct inode *ip) +{ + struct dirhash *dh; + struct buf *bp = NULL; + struct direct *ep; + struct vnode *vp; + doff_t bmask, pos; + int dirblocks, i, j, memreqd, nblocks, narrays, nslots, slot; + const int needswap = UFS_MPNEEDSWAP(ip->i_ump); + int dirblksiz = ip->i_ump->um_dirblksiz; + + /* Check if we can/should use dirhash. */ + if (ip->i_dirhash == NULL) { + if (ip->i_size < (ufs_dirhashminblks * dirblksiz) || OFSFMT(ip)) + return (-1); + } else { + /* Hash exists, but sysctls could have changed. */ + if (ip->i_size < (ufs_dirhashminblks * dirblksiz) || + ufs_dirhashmem > ufs_dirhashmaxmem) { + ufsdirhash_free(ip); + return (-1); + } + /* Check if hash exists and is intact (note: unlocked read). */ + if (ip->i_dirhash->dh_hash != NULL) + return (0); + /* Free the old, recycled hash and build a new one. */ + ufsdirhash_free(ip); + } + + /* Don't hash removed directories. */ + if (ip->i_nlink == 0) + return (-1); + + vp = ip->i_vnode; + /* Allocate 50% more entries than this dir size could ever need. */ + KASSERT(ip->i_size >= dirblksiz); + nslots = ip->i_size / DIRECTSIZ(1); + nslots = (nslots * 3 + 1) / 2; + narrays = howmany(nslots, DH_NBLKOFF); + nslots = narrays * DH_NBLKOFF; + dirblocks = howmany(ip->i_size, dirblksiz); + nblocks = (dirblocks * 3 + 1) / 2; + + memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) + + narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) + + nblocks * sizeof(*dh->dh_blkfree); + + while (atomic_add_int_nv(&ufs_dirhashmem, memreqd) > + ufs_dirhashmaxmem) { + atomic_add_int(&ufs_dirhashmem, -memreqd); + if (memreqd > ufs_dirhashmaxmem / 2) + return (-1); + /* Try to free some space. 
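ufsdirhash_build() above charges its projected memory use to ufs_dirhashmem with an atomic add and immediately backs the charge out if the total exceeded ufs_dirhashmaxmem. A user-space sketch of that reserve-then-undo pattern using C11 atomics, with hypothetical names; the kernel version retries after recycling old hashes instead of failing outright:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint pool_used;
static unsigned int pool_limit = 2 * 1024 * 1024;

/* Try to reserve "req" bytes from the budget; undo the add on failure. */
static bool
budget_reserve(unsigned int req)
{
	unsigned int newval = atomic_fetch_add(&pool_used, req) + req;

	if (newval > pool_limit) {
		atomic_fetch_sub(&pool_used, req);	/* back the charge out */
		return false;
	}
	return true;
}

static void
budget_release(unsigned int req)
{
	atomic_fetch_sub(&pool_used, req);
}

int
main(void)
{
	printf("%d\n", budget_reserve(1024));			/* 1: fits */
	printf("%d\n", budget_reserve(4u * 1024 * 1024));	/* 0: over the limit */
	budget_release(1024);
	return 0;
}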
*/ + if (ufsdirhash_recycle(memreqd) != 0) + return (-1); + else + DIRHASHLIST_UNLOCK(); + } + + /* + * Use non-blocking mallocs so that we will revert to a linear + * lookup on failure rather than potentially blocking forever. + */ + dh = pool_cache_get(ufsdirhash_cache, PR_NOWAIT); + if (dh == NULL) { + atomic_add_int(&ufs_dirhashmem, -memreqd); + return (-1); + } + memset(dh, 0, sizeof(*dh)); + mutex_init(&dh->dh_lock, MUTEX_DEFAULT, IPL_NONE); + DIRHASH_LOCK(dh); + dh->dh_hashsz = narrays * sizeof(dh->dh_hash[0]); + dh->dh_hash = kmem_zalloc(dh->dh_hashsz, KM_NOSLEEP); + dh->dh_blkfreesz = nblocks * sizeof(dh->dh_blkfree[0]); + dh->dh_blkfree = kmem_zalloc(dh->dh_blkfreesz, KM_NOSLEEP); + if (dh->dh_hash == NULL || dh->dh_blkfree == NULL) + goto fail; + for (i = 0; i < narrays; i++) { + if ((dh->dh_hash[i] = DIRHASH_BLKALLOC()) == NULL) + goto fail; + for (j = 0; j < DH_NBLKOFF; j++) + dh->dh_hash[i][j] = DIRHASH_EMPTY; + } + + /* Initialise the hash table and block statistics. */ + dh->dh_narrays = narrays; + dh->dh_hlen = nslots; + dh->dh_nblk = nblocks; + dh->dh_dirblks = dirblocks; + for (i = 0; i < dirblocks; i++) + dh->dh_blkfree[i] = dirblksiz / DIRALIGN; + for (i = 0; i < DH_NFSTATS; i++) + dh->dh_firstfree[i] = -1; + dh->dh_firstfree[DH_NFSTATS] = 0; + dh->dh_seqopt = 0; + dh->dh_seqoff = 0; + dh->dh_score = DH_SCOREINIT; + ip->i_dirhash = dh; + + bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1; + pos = 0; + while (pos < ip->i_size) { + if ((curcpu()->ci_schedstate.spc_flags & SPCF_SHOULDYIELD) + != 0) { + preempt(); + } + /* If necessary, get the next directory block. */ + if ((pos & bmask) == 0) { + if (bp != NULL) + brelse(bp, 0); + if (ufs_blkatoff(vp, (off_t)pos, NULL, &bp, false) != 0) + goto fail; + } + + /* Add this entry to the hash. */ + ep = (struct direct *)((char *)bp->b_data + (pos & bmask)); + if (ep->d_reclen == 0 || ep->d_reclen > + dirblksiz - (pos & (dirblksiz - 1))) { + /* Corrupted directory. */ + brelse(bp, 0); + goto fail; + } + if (ep->d_ino != 0) { + /* Add the entry (simplified ufsdirhash_add). */ + slot = ufsdirhash_hash(dh, ep->d_name, ep->d_namlen); + while (DH_ENTRY(dh, slot) != DIRHASH_EMPTY) + slot = WRAPINCR(slot, dh->dh_hlen); + dh->dh_hused++; + DH_ENTRY(dh, slot) = pos; + ufsdirhash_adjfree(dh, pos, -DIRSIZ(0, ep, needswap), + dirblksiz); + } + pos += ep->d_reclen; + } + + if (bp != NULL) + brelse(bp, 0); + DIRHASHLIST_LOCK(); + TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list); + dh->dh_onlist = 1; + DIRHASH_UNLOCK(dh); + DIRHASHLIST_UNLOCK(); + return (0); + +fail: + DIRHASH_UNLOCK(dh); + if (dh->dh_hash != NULL) { + for (i = 0; i < narrays; i++) + if (dh->dh_hash[i] != NULL) + DIRHASH_BLKFREE(dh->dh_hash[i]); + kmem_free(dh->dh_hash, dh->dh_hashsz); + } + if (dh->dh_blkfree != NULL) + kmem_free(dh->dh_blkfree, dh->dh_blkfreesz); + mutex_destroy(&dh->dh_lock); + pool_cache_put(ufsdirhash_cache, dh); + ip->i_dirhash = NULL; + atomic_add_int(&ufs_dirhashmem, -memreqd); + return (-1); +} + +/* + * Free any hash table associated with inode 'ip'. + */ +void +ufsdirhash_free(struct inode *ip) +{ + struct dirhash *dh; + int i, mem; + + if ((dh = ip->i_dirhash) == NULL) + return; + + if (dh->dh_onlist) { + DIRHASHLIST_LOCK(); + if (dh->dh_onlist) + TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); + DIRHASHLIST_UNLOCK(); + } + + /* The dirhash pointed to by 'dh' is exclusively ours now. 
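The build loop above inserts each directory entry with open addressing: hash the name, then advance with WRAPINCR past occupied slots until an empty one is found, and ufsdirhash_lookup() later walks the same probe sequence. A compact stand-alone sketch of that insert/lookup scheme with a toy hash; unlike the real code, which re-reads the directory block to compare names, this toy stores the names in the table:

#include <stdio.h>
#include <string.h>

#define HLEN	16			/* table size, cf. dh_hlen */
#define EMPTY	(-1)			/* cf. DIRHASH_EMPTY */
#define WRAPINCR(v, lim) (((v) + 1 == (lim)) ? 0 : ((v) + 1))

static int slots[HLEN];			/* stores directory offsets */
static char names[HLEN][32];		/* toy stand-in for the on-disk entry */

static unsigned
hash(const char *name)
{
	unsigned h = 0;

	while (*name)
		h = h * 31 + (unsigned char)*name++;	/* toy hash function */
	return h % HLEN;
}

static void
insert(const char *name, int offset)
{
	unsigned slot = hash(name);

	while (slots[slot] != EMPTY)		/* probe past occupied slots */
		slot = WRAPINCR(slot, HLEN);
	slots[slot] = offset;
	strncpy(names[slot], name, sizeof(names[slot]) - 1);
}

static int
lookup(const char *name)
{
	unsigned slot = hash(name);

	while (slots[slot] != EMPTY) {		/* same probe sequence as insert */
		if (strcmp(names[slot], name) == 0)
			return slots[slot];
		slot = WRAPINCR(slot, HLEN);
	}
	return EMPTY;
}

int
main(void)
{
	for (int i = 0; i < HLEN; i++)
		slots[i] = EMPTY;
	insert("foo.c", 0);
	insert("bar.c", 512);
	printf("%d %d %d\n", lookup("foo.c"), lookup("bar.c"), lookup("baz"));
	return 0;
}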
*/ + mem = sizeof(*dh); + if (dh->dh_hash != NULL) { + for (i = 0; i < dh->dh_narrays; i++) + DIRHASH_BLKFREE(dh->dh_hash[i]); + kmem_free(dh->dh_hash, dh->dh_hashsz); + kmem_free(dh->dh_blkfree, dh->dh_blkfreesz); + mem += dh->dh_hashsz; + mem += dh->dh_narrays * DH_NBLKOFF * sizeof(**dh->dh_hash); + mem += dh->dh_nblk * sizeof(*dh->dh_blkfree); + } + mutex_destroy(&dh->dh_lock); + pool_cache_put(ufsdirhash_cache, dh); + ip->i_dirhash = NULL; + + atomic_add_int(&ufs_dirhashmem, -mem); +} + +/* + * Find the offset of the specified name within the given inode. + * Returns 0 on success, ENOENT if the entry does not exist, or + * EJUSTRETURN if the caller should revert to a linear search. + * + * If successful, the directory offset is stored in *offp, and a + * pointer to a struct buf containing the entry is stored in *bpp. If + * prevoffp is non-NULL, the offset of the previous entry within + * the DIRBLKSIZ-sized block is stored in *prevoffp (if the entry + * is the first in a block, the start of the block is used). + */ +int +ufsdirhash_lookup(struct inode *ip, const char *name, int namelen, doff_t *offp, + struct buf **bpp, doff_t *prevoffp) +{ + struct dirhash *dh, *dh_next; + struct direct *dp; + struct vnode *vp; + struct buf *bp; + doff_t blkoff, bmask, offset, prevoff; + int i, slot; + const int needswap = UFS_MPNEEDSWAP(ip->i_ump); + int dirblksiz = ip->i_ump->um_dirblksiz; + + if ((dh = ip->i_dirhash) == NULL) + return (EJUSTRETURN); + + /* + * Move this dirhash towards the end of the list if it has a + * score higher than the next entry, and acquire the dh_lock. + * Optimise the case where it's already the last by performing + * an unlocked read of the TAILQ_NEXT pointer. + * + * In both cases, end up holding just dh_lock. + */ + if (TAILQ_NEXT(dh, dh_list) != NULL) { + DIRHASHLIST_LOCK(); + DIRHASH_LOCK(dh); + /* + * If the new score will be greater than that of the next + * entry, then move this entry past it. With both mutexes + * held, dh_next won't go away, but its dh_score could + * change; that's not important since it is just a hint. + */ + if (dh->dh_hash != NULL && + (dh_next = TAILQ_NEXT(dh, dh_list)) != NULL && + dh->dh_score >= dh_next->dh_score) { + KASSERT(dh->dh_onlist); + TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); + TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh, + dh_list); + } + DIRHASHLIST_UNLOCK(); + } else { + /* Already the last, though that could change as we wait. */ + DIRHASH_LOCK(dh); + } + if (dh->dh_hash == NULL) { + DIRHASH_UNLOCK(dh); + ufsdirhash_free(ip); + return (EJUSTRETURN); + } + + /* Update the score. */ + if (dh->dh_score < DH_SCOREMAX) + dh->dh_score++; + + vp = ip->i_vnode; + bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1; + blkoff = -1; + bp = NULL; +restart: + slot = ufsdirhash_hash(dh, name, namelen); + + if (dh->dh_seqopt) { + /* + * Sequential access optimisation. dh_seqoff contains the + * offset of the directory entry immediately following + * the last entry that was looked up. Check if this offset + * appears in the hash chain for the name we are looking for. + */ + for (i = slot; (offset = DH_ENTRY(dh, i)) != DIRHASH_EMPTY; + i = WRAPINCR(i, dh->dh_hlen)) + if (offset == dh->dh_seqoff) + break; + if (offset == dh->dh_seqoff) { + /* + * We found an entry with the expected offset. This + * is probably the entry we want, but if not, the + * code below will turn off seqoff and retry. 
+ */ + slot = i; + } else + dh->dh_seqopt = 0; + } + + for (; (offset = DH_ENTRY(dh, slot)) != DIRHASH_EMPTY; + slot = WRAPINCR(slot, dh->dh_hlen)) { + if (offset == DIRHASH_DEL) + continue; + + if (offset < 0 || offset >= ip->i_size) + panic("ufsdirhash_lookup: bad offset in hash array"); + if ((offset & ~bmask) != blkoff) { + if (bp != NULL) + brelse(bp, 0); + blkoff = offset & ~bmask; + if (ufs_blkatoff(vp, (off_t)blkoff, + NULL, &bp, false) != 0) { + DIRHASH_UNLOCK(dh); + return (EJUSTRETURN); + } + } + dp = (struct direct *)((char *)bp->b_data + (offset & bmask)); + if (dp->d_reclen == 0 || dp->d_reclen > + dirblksiz - (offset & (dirblksiz - 1))) { + /* Corrupted directory. */ + DIRHASH_UNLOCK(dh); + brelse(bp, 0); + return (EJUSTRETURN); + } + if (dp->d_namlen == namelen && + memcmp(dp->d_name, name, namelen) == 0) { + /* Found. Get the prev offset if needed. */ + if (prevoffp != NULL) { + if (offset & (dirblksiz - 1)) { + prevoff = ufsdirhash_getprev(dp, + offset, dirblksiz); + if (prevoff == -1) { + brelse(bp, 0); + return (EJUSTRETURN); + } + } else + prevoff = offset; + *prevoffp = prevoff; + } + + /* Check for sequential access, and update offset. */ + if (dh->dh_seqopt == 0 && dh->dh_seqoff == offset) + dh->dh_seqopt = 1; + dh->dh_seqoff = offset + DIRSIZ(0, dp, needswap); + DIRHASH_UNLOCK(dh); + + *bpp = bp; + *offp = offset; + return (0); + } + + if (dh->dh_hash == NULL) { + DIRHASH_UNLOCK(dh); + if (bp != NULL) + brelse(bp, 0); + ufsdirhash_free(ip); + return (EJUSTRETURN); + } + /* + * When the name doesn't match in the seqopt case, go back + * and search normally. + */ + if (dh->dh_seqopt) { + dh->dh_seqopt = 0; + goto restart; + } + } + DIRHASH_UNLOCK(dh); + if (bp != NULL) + brelse(bp, 0); + return (ENOENT); +} + +/* + * Find a directory block with room for 'slotneeded' bytes. Returns + * the offset of the directory entry that begins the free space. + * This will either be the offset of an existing entry that has free + * space at the end, or the offset of an entry with d_ino == 0 at + * the start of a DIRBLKSIZ block. + * + * To use the space, the caller may need to compact existing entries in + * the directory. The total number of bytes in all of the entries involved + * in the compaction is stored in *slotsize. In other words, all of + * the entries that must be compacted are exactly contained in the + * region beginning at the returned offset and spanning *slotsize bytes. + * + * Returns -1 if no space was found, indicating that the directory + * must be extended. + */ +doff_t +ufsdirhash_findfree(struct inode *ip, int slotneeded, int *slotsize) +{ + struct direct *dp; + struct dirhash *dh; + struct buf *bp; + doff_t pos, slotstart; + int dirblock, error, freebytes, i; + const int needswap = UFS_MPNEEDSWAP(ip->i_ump); + int dirblksiz = ip->i_ump->um_dirblksiz; + + if ((dh = ip->i_dirhash) == NULL) + return (-1); + + DIRHASH_LOCK(dh); + if (dh->dh_hash == NULL) { + DIRHASH_UNLOCK(dh); + ufsdirhash_free(ip); + return (-1); + } + + /* Find a directory block with the desired free space. 
*/ + dirblock = -1; + for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++) + if ((dirblock = dh->dh_firstfree[i]) != -1) + break; + if (dirblock == -1) { + DIRHASH_UNLOCK(dh); + return (-1); + } + + KASSERT(dirblock < dh->dh_nblk && + dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN)); + pos = dirblock * dirblksiz; + error = ufs_blkatoff(ip->i_vnode, (off_t)pos, (void *)&dp, &bp, false); + if (error) { + DIRHASH_UNLOCK(dh); + return (-1); + } + /* Find the first entry with free space. */ + for (i = 0; i < dirblksiz; ) { + if (dp->d_reclen == 0) { + DIRHASH_UNLOCK(dh); + brelse(bp, 0); + return (-1); + } + if (dp->d_ino == 0 || dp->d_reclen > DIRSIZ(0, dp, needswap)) + break; + i += dp->d_reclen; + dp = (struct direct *)((char *)dp + dp->d_reclen); + } + if (i > dirblksiz) { + DIRHASH_UNLOCK(dh); + brelse(bp, 0); + return (-1); + } + slotstart = pos + i; + + /* Find the range of entries needed to get enough space */ + freebytes = 0; + while (i < dirblksiz && freebytes < slotneeded) { + freebytes += dp->d_reclen; + if (dp->d_ino != 0) + freebytes -= DIRSIZ(0, dp, needswap); + if (dp->d_reclen == 0) { + DIRHASH_UNLOCK(dh); + brelse(bp, 0); + return (-1); + } + i += dp->d_reclen; + dp = (struct direct *)((char *)dp + dp->d_reclen); + } + if (i > dirblksiz) { + DIRHASH_UNLOCK(dh); + brelse(bp, 0); + return (-1); + } + if (freebytes < slotneeded) + panic("ufsdirhash_findfree: free mismatch"); + DIRHASH_UNLOCK(dh); + brelse(bp, 0); + *slotsize = pos + i - slotstart; + return (slotstart); +} + +/* + * Return the start of the unused space at the end of a directory, or + * -1 if there are no trailing unused blocks. + */ +doff_t +ufsdirhash_enduseful(struct inode *ip) +{ + struct dirhash *dh; + int i; + int dirblksiz = ip->i_ump->um_dirblksiz; + + if ((dh = ip->i_dirhash) == NULL) + return (-1); + + DIRHASH_LOCK(dh); + if (dh->dh_hash == NULL) { + DIRHASH_UNLOCK(dh); + ufsdirhash_free(ip); + return (-1); + } + + if (dh->dh_blkfree[dh->dh_dirblks - 1] != dirblksiz / DIRALIGN) { + DIRHASH_UNLOCK(dh); + return (-1); + } + + for (i = dh->dh_dirblks - 1; i >= 0; i--) + if (dh->dh_blkfree[i] != dirblksiz / DIRALIGN) + break; + DIRHASH_UNLOCK(dh); + return ((doff_t)(i + 1) * dirblksiz); +} + +/* + * Insert information into the hash about a new directory entry. dirp + * points to a struct direct containing the entry, and offset specifies + * the offset of this entry. + */ +void +ufsdirhash_add(struct inode *ip, struct direct *dirp, doff_t offset) +{ + struct dirhash *dh; + int slot; + const int needswap = UFS_MPNEEDSWAP(ip->i_ump); + int dirblksiz = ip->i_ump->um_dirblksiz; + + if ((dh = ip->i_dirhash) == NULL) + return; + + DIRHASH_LOCK(dh); + if (dh->dh_hash == NULL) { + DIRHASH_UNLOCK(dh); + ufsdirhash_free(ip); + return; + } + + KASSERT(offset < dh->dh_dirblks * dirblksiz); + /* + * Normal hash usage is < 66%. If the usage gets too high then + * remove the hash entirely and let it be rebuilt later. + */ + if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) { + DIRHASH_UNLOCK(dh); + ufsdirhash_free(ip); + return; + } + + /* Find a free hash slot (empty or deleted), and add the entry. */ + slot = ufsdirhash_hash(dh, dirp->d_name, dirp->d_namlen); + while (DH_ENTRY(dh, slot) >= 0) + slot = WRAPINCR(slot, dh->dh_hlen); + if (DH_ENTRY(dh, slot) == DIRHASH_EMPTY) + dh->dh_hused++; + DH_ENTRY(dh, slot) = offset; + + /* Update the per-block summary info. 
*/ + ufsdirhash_adjfree(dh, offset, -DIRSIZ(0, dirp, needswap), dirblksiz); + DIRHASH_UNLOCK(dh); +} + +/* + * Remove the specified directory entry from the hash. The entry to remove + * is defined by the name in `dirp', which must exist at the specified + * `offset' within the directory. + */ +void +ufsdirhash_remove(struct inode *ip, struct direct *dirp, doff_t offset) +{ + struct dirhash *dh; + int slot; + const int needswap = UFS_MPNEEDSWAP(ip->i_ump); + int dirblksiz = ip->i_ump->um_dirblksiz; + + if ((dh = ip->i_dirhash) == NULL) + return; + + DIRHASH_LOCK(dh); + if (dh->dh_hash == NULL) { + DIRHASH_UNLOCK(dh); + ufsdirhash_free(ip); + return; + } + + KASSERT(offset < dh->dh_dirblks * dirblksiz); + /* Find the entry */ + slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, offset); + + /* Remove the hash entry. */ + ufsdirhash_delslot(dh, slot); + + /* Update the per-block summary info. */ + ufsdirhash_adjfree(dh, offset, DIRSIZ(0, dirp, needswap), dirblksiz); + DIRHASH_UNLOCK(dh); +} + +/* + * Change the offset associated with a directory entry in the hash. Used + * when compacting directory blocks. + */ +void +ufsdirhash_move(struct inode *ip, struct direct *dirp, doff_t oldoff, + doff_t newoff) +{ + struct dirhash *dh; + int slot; + + if ((dh = ip->i_dirhash) == NULL) + return; + DIRHASH_LOCK(dh); + if (dh->dh_hash == NULL) { + DIRHASH_UNLOCK(dh); + ufsdirhash_free(ip); + return; + } + + KASSERT(oldoff < dh->dh_dirblks * ip->i_ump->um_dirblksiz && + newoff < dh->dh_dirblks * ip->i_ump->um_dirblksiz); + /* Find the entry, and update the offset. */ + slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff); + DH_ENTRY(dh, slot) = newoff; + DIRHASH_UNLOCK(dh); +} + +/* + * Inform dirhash that the directory has grown by one block that + * begins at offset (i.e. the new length is offset + DIRBLKSIZ). + */ +void +ufsdirhash_newblk(struct inode *ip, doff_t offset) +{ + struct dirhash *dh; + int block; + int dirblksiz = ip->i_ump->um_dirblksiz; + + if ((dh = ip->i_dirhash) == NULL) + return; + DIRHASH_LOCK(dh); + if (dh->dh_hash == NULL) { + DIRHASH_UNLOCK(dh); + ufsdirhash_free(ip); + return; + } + + KASSERT(offset == dh->dh_dirblks * dirblksiz); + block = offset / dirblksiz; + if (block >= dh->dh_nblk) { + /* Out of space; must rebuild. */ + DIRHASH_UNLOCK(dh); + ufsdirhash_free(ip); + return; + } + dh->dh_dirblks = block + 1; + + /* Account for the new free block. */ + dh->dh_blkfree[block] = dirblksiz / DIRALIGN; + if (dh->dh_firstfree[DH_NFSTATS] == -1) + dh->dh_firstfree[DH_NFSTATS] = block; + DIRHASH_UNLOCK(dh); +} + +/* + * Inform dirhash that the directory is being truncated. + */ +void +ufsdirhash_dirtrunc(struct inode *ip, doff_t offset) +{ + struct dirhash *dh; + int block, i; + int dirblksiz = ip->i_ump->um_dirblksiz; + + if ((dh = ip->i_dirhash) == NULL) + return; + + DIRHASH_LOCK(dh); + if (dh->dh_hash == NULL) { + DIRHASH_UNLOCK(dh); + ufsdirhash_free(ip); + return; + } + + KASSERT(offset <= dh->dh_dirblks * dirblksiz); + block = howmany(offset, dirblksiz); + /* + * If the directory shrinks to less than 1/8 of dh_nblk blocks + * (about 20% of its original size due to the 50% extra added in + * ufsdirhash_build) then free it, and let the caller rebuild + * if necessary. + */ + if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) { + DIRHASH_UNLOCK(dh); + ufsdirhash_free(ip); + return; + } + + /* + * Remove any `first free' information pertaining to the + * truncated blocks. All blocks we're removing should be + * completely unused. 
+ */ + if (dh->dh_firstfree[DH_NFSTATS] >= block) + dh->dh_firstfree[DH_NFSTATS] = -1; + for (i = block; i < dh->dh_dirblks; i++) + if (dh->dh_blkfree[i] != dirblksiz / DIRALIGN) + panic("ufsdirhash_dirtrunc: blocks in use"); + for (i = 0; i < DH_NFSTATS; i++) + if (dh->dh_firstfree[i] >= block) + panic("ufsdirhash_dirtrunc: first free corrupt"); + dh->dh_dirblks = block; + DIRHASH_UNLOCK(dh); +} + +/* + * Debugging function to check that the dirhash information about + * a directory block matches its actual contents. Panics if a mismatch + * is detected. + * + * On entry, `sbuf' should point to the start of an in-core + * DIRBLKSIZ-sized directory block, and `offset' should contain the + * offset from the start of the directory of that block. + */ +void +ufsdirhash_checkblock(struct inode *ip, char *sbuf, doff_t offset) +{ + struct dirhash *dh; + struct direct *dp; + int block, ffslot, i, nfree; + const int needswap = UFS_MPNEEDSWAP(ip->i_ump); + int dirblksiz = ip->i_ump->um_dirblksiz; + + if (!ufs_dirhashcheck) + return; + if ((dh = ip->i_dirhash) == NULL) + return; + + DIRHASH_LOCK(dh); + if (dh->dh_hash == NULL) { + DIRHASH_UNLOCK(dh); + ufsdirhash_free(ip); + return; + } + + block = offset / dirblksiz; + if ((offset & (dirblksiz - 1)) != 0 || block >= dh->dh_dirblks) + panic("ufsdirhash_checkblock: bad offset"); + + nfree = 0; + for (i = 0; i < dirblksiz; i += dp->d_reclen) { + dp = (struct direct *)(sbuf + i); + if (dp->d_reclen == 0 || i + dp->d_reclen > dirblksiz) + panic("ufsdirhash_checkblock: bad dir"); + + if (dp->d_ino == 0) { +#if 0 + /* + * XXX entries with d_ino == 0 should only occur + * at the start of a DIRBLKSIZ block. However the + * ufs code is tolerant of such entries at other + * offsets, and fsck does not fix them. + */ + if (i != 0) + panic("ufsdirhash_checkblock: bad dir inode"); +#endif + nfree += dp->d_reclen; + continue; + } + + /* Check that the entry exists (will panic if it doesn't). */ + ufsdirhash_findslot(dh, dp->d_name, dp->d_namlen, offset + i); + + nfree += dp->d_reclen - DIRSIZ(0, dp, needswap); + } + if (i != dirblksiz) + panic("ufsdirhash_checkblock: bad dir end"); + + if (dh->dh_blkfree[block] * DIRALIGN != nfree) + panic("ufsdirhash_checkblock: bad free count"); + + ffslot = BLKFREE2IDX(nfree / DIRALIGN); + for (i = 0; i <= DH_NFSTATS; i++) + if (dh->dh_firstfree[i] == block && i != ffslot) + panic("ufsdirhash_checkblock: bad first-free"); + if (dh->dh_firstfree[ffslot] == -1) + panic("ufsdirhash_checkblock: missing first-free entry"); + DIRHASH_UNLOCK(dh); +} + +/* + * Hash the specified filename into a dirhash slot. + */ +static int +ufsdirhash_hash(struct dirhash *dh, const char *name, int namelen) +{ + u_int32_t hash; + + /* + * We hash the name and then some other bit of data that is + * invariant over the dirhash's lifetime. Otherwise names + * differing only in the last byte are placed close to one + * another in the table, which is bad for linear probing. + */ + hash = hash32_buf(name, namelen, HASH32_BUF_INIT); + hash = hash32_buf(&dh, sizeof(dh), hash); + return (hash % dh->dh_hlen); +} + +/* + * Adjust the number of free bytes in the block containing `offset' + * by the value specified by `diff'. + * + * The caller must ensure we have exclusive access to `dh'; normally + * that means that dh_lock should be held, but this is also called + * from ufsdirhash_build() where exclusive access can be assumed. 
+ */ +static void +ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff, int dirblksiz) +{ + int block, i, nfidx, ofidx; + + KASSERT(mutex_owned(&dh->dh_lock)); + + /* Update the per-block summary info. */ + block = offset / dirblksiz; + KASSERT(block < dh->dh_nblk && block < dh->dh_dirblks); + ofidx = BLKFREE2IDX(dh->dh_blkfree[block]); + dh->dh_blkfree[block] = (int)dh->dh_blkfree[block] + (diff / DIRALIGN); + nfidx = BLKFREE2IDX(dh->dh_blkfree[block]); + + /* Update the `first free' list if necessary. */ + if (ofidx != nfidx) { + /* If removing, scan forward for the next block. */ + if (dh->dh_firstfree[ofidx] == block) { + for (i = block + 1; i < dh->dh_dirblks; i++) + if (BLKFREE2IDX(dh->dh_blkfree[i]) == ofidx) + break; + dh->dh_firstfree[ofidx] = (i < dh->dh_dirblks) ? i : -1; + } + + /* Make this the new `first free' if necessary */ + if (dh->dh_firstfree[nfidx] > block || + dh->dh_firstfree[nfidx] == -1) + dh->dh_firstfree[nfidx] = block; + } +} + +/* + * Find the specified name which should have the specified offset. + * Returns a slot number, and panics on failure. + * + * `dh' must be locked on entry and remains so on return. + */ +static int +ufsdirhash_findslot(struct dirhash *dh, const char *name, int namelen, + doff_t offset) +{ + int slot; + + KASSERT(mutex_owned(&dh->dh_lock)); + + /* Find the entry. */ + KASSERT(dh->dh_hused < dh->dh_hlen); + slot = ufsdirhash_hash(dh, name, namelen); + while (DH_ENTRY(dh, slot) != offset && + DH_ENTRY(dh, slot) != DIRHASH_EMPTY) + slot = WRAPINCR(slot, dh->dh_hlen); + if (DH_ENTRY(dh, slot) != offset) + panic("ufsdirhash_findslot: '%.*s' not found", namelen, name); + + return (slot); +} + +/* + * Remove the entry corresponding to the specified slot from the hash array. + * + * `dh' must be locked on entry and remains so on return. + */ +static void +ufsdirhash_delslot(struct dirhash *dh, int slot) +{ + int i; + + KASSERT(mutex_owned(&dh->dh_lock)); + + /* Mark the entry as deleted. */ + DH_ENTRY(dh, slot) = DIRHASH_DEL; + + /* If this is the end of a chain of DIRHASH_DEL slots, remove them. */ + for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; ) + i = WRAPINCR(i, dh->dh_hlen); + if (DH_ENTRY(dh, i) == DIRHASH_EMPTY) { + i = WRAPDECR(i, dh->dh_hlen); + while (DH_ENTRY(dh, i) == DIRHASH_DEL) { + DH_ENTRY(dh, i) = DIRHASH_EMPTY; + dh->dh_hused--; + i = WRAPDECR(i, dh->dh_hlen); + } + KASSERT(dh->dh_hused >= 0); + } +} + +/* + * Given a directory entry and its offset, find the offset of the + * previous entry in the same DIRBLKSIZ-sized block. Returns an + * offset, or -1 if there is no previous entry in the block or some + * other problem occurred. + */ +static doff_t +ufsdirhash_getprev(struct direct *dirp, doff_t offset, int dirblksiz) +{ + struct direct *dp; + char *blkbuf; + doff_t blkoff, prevoff; + int entrypos, i; + + blkoff = offset & ~(dirblksiz - 1); /* offset of start of block */ + entrypos = offset & (dirblksiz - 1); /* entry relative to block */ + blkbuf = (char *)dirp - entrypos; + prevoff = blkoff; + + /* If `offset' is the start of a block, there is no previous entry. */ + if (entrypos == 0) + return (-1); + + /* Scan from the start of the block until we get to the entry. */ + for (i = 0; i < entrypos; i += dp->d_reclen) { + dp = (struct direct *)(blkbuf + i); + if (dp->d_reclen == 0 || i + dp->d_reclen > entrypos) + return (-1); /* Corrupted directory. */ + prevoff = blkoff + i; + } + return (prevoff); +} + +/* + * Try to free up `wanted' bytes by stealing memory from existing + * dirhashes. 
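/*
 * A stand-alone user-space sketch (not part of this change) of the
 * open-addressing scheme used by ufsdirhash_hash(), ufsdirhash_findslot()
 * and ufsdirhash_delslot() above: probe with a wrapping increment, leave a
 * tombstone (DIRHASH_DEL) on delete, and collapse a tombstone run back to
 * DIRHASH_EMPTY when it ends at an empty slot.  The table size, the toy
 * hash and all names are invented; like the kernel code (which keeps
 * dh_hused below dh_hlen) it assumes at least one slot stays empty.
 */
#include <stdio.h>

#define TOY_LEN		16
#define TOY_EMPTY	(-1)	/* like DIRHASH_EMPTY: slot never used */
#define TOY_DEL		(-2)	/* like DIRHASH_DEL: tombstone          */
#define TOY_INC(i)	(((i) + 1) % TOY_LEN)
#define TOY_DEC(i)	(((i) + TOY_LEN - 1) % TOY_LEN)

static int toy_table[TOY_LEN];

/* Stands in for hash32_buf(name) mixed with the dirhash pointer. */
static unsigned
toy_hash(const char *name)
{
	unsigned h = 2166136261u;

	while (*name)
		h = (h ^ (unsigned char)*name++) * 16777619u;
	return h % TOY_LEN;
}

/* Insert: skip live entries, reuse the first empty or deleted slot. */
static void
toy_add(const char *name, int offset)
{
	unsigned slot = toy_hash(name);

	while (toy_table[slot] >= 0)
		slot = TOY_INC(slot);
	toy_table[slot] = offset;
}

/* Delete: assumes the entry exists (the kernel panics if it does not). */
static void
toy_del(const char *name, int offset)
{
	unsigned i, slot = toy_hash(name);

	while (toy_table[slot] != offset && toy_table[slot] != TOY_EMPTY)
		slot = TOY_INC(slot);
	toy_table[slot] = TOY_DEL;

	/* If this tombstone run ends at an empty slot, collapse the run. */
	for (i = slot; toy_table[i] == TOY_DEL; )
		i = TOY_INC(i);
	if (toy_table[i] == TOY_EMPTY) {
		i = TOY_DEC(i);
		while (toy_table[i] == TOY_DEL) {
			toy_table[i] = TOY_EMPTY;
			i = TOY_DEC(i);
		}
	}
}

int
main(void)
{
	unsigned i;

	for (i = 0; i < TOY_LEN; i++)
		toy_table[i] = TOY_EMPTY;
	toy_add("abc", 0);		/* a first entry at offset 0   */
	toy_add("abd", 512);		/* a second entry              */
	toy_del("abc", 0);		/* remove the first again      */
	for (i = 0; i < TOY_LEN; i++)
		printf("%2u: %d\n", i, toy_table[i]);
	return 0;
}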
Returns zero with list locked if successful. + */ +static int +ufsdirhash_recycle(int wanted) +{ + struct dirhash *dh; + doff_t **hash; + u_int8_t *blkfree; + int i, mem, narrays; + size_t hashsz, blkfreesz; + + DIRHASHLIST_LOCK(); + while (wanted + ufs_dirhashmem > ufs_dirhashmaxmem) { + /* Find a dirhash, and lock it. */ + if ((dh = TAILQ_FIRST(&ufsdirhash_list)) == NULL) { + DIRHASHLIST_UNLOCK(); + return (-1); + } + DIRHASH_LOCK(dh); + KASSERT(dh->dh_hash != NULL); + + /* Decrement the score; only recycle if it becomes zero. */ + if (--dh->dh_score > 0) { + DIRHASH_UNLOCK(dh); + DIRHASHLIST_UNLOCK(); + return (-1); + } + + /* Remove it from the list and detach its memory. */ + TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); + dh->dh_onlist = 0; + hash = dh->dh_hash; + hashsz = dh->dh_hashsz; + dh->dh_hash = NULL; + blkfree = dh->dh_blkfree; + blkfreesz = dh->dh_blkfreesz; + dh->dh_blkfree = NULL; + narrays = dh->dh_narrays; + mem = narrays * sizeof(*dh->dh_hash) + + narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) + + dh->dh_nblk * sizeof(*dh->dh_blkfree); + + /* Unlock everything, free the detached memory. */ + DIRHASH_UNLOCK(dh); + DIRHASHLIST_UNLOCK(); + + for (i = 0; i < narrays; i++) + DIRHASH_BLKFREE(hash[i]); + kmem_free(hash, hashsz); + kmem_free(blkfree, blkfreesz); + + /* Account for the returned memory, and repeat if necessary. */ + DIRHASHLIST_LOCK(); + atomic_add_int(&ufs_dirhashmem, -mem); + } + /* Success. */ + return (0); +} + +static void +ufsdirhash_sysctl_init(void) +{ + const struct sysctlnode *rnode, *cnode; + + sysctl_createv(&ufsdirhash_sysctl_log, 0, NULL, &rnode, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "vfs", NULL, + NULL, 0, NULL, 0, + CTL_VFS, CTL_EOL); + + sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &rnode, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "ufs", + SYSCTL_DESCR("ufs"), + NULL, 0, NULL, 0, + CTL_CREATE, CTL_EOL); + + sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &rnode, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "dirhash", + SYSCTL_DESCR("dirhash"), + NULL, 0, NULL, 0, + CTL_CREATE, CTL_EOL); + + sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &cnode, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "minblocks", + SYSCTL_DESCR("minimum hashed directory size in blocks"), + NULL, 0, &ufs_dirhashminblks, 0, + CTL_CREATE, CTL_EOL); + + sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &cnode, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "maxmem", + SYSCTL_DESCR("maximum dirhash memory usage"), + NULL, 0, &ufs_dirhashmaxmem, 0, + CTL_CREATE, CTL_EOL); + + sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &cnode, + CTLFLAG_PERMANENT|CTLFLAG_READONLY, + CTLTYPE_INT, "memused", + SYSCTL_DESCR("current dirhash memory usage"), + NULL, 0, &ufs_dirhashmem, 0, + CTL_CREATE, CTL_EOL); + + sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &cnode, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "docheck", + SYSCTL_DESCR("enable extra sanity checks"), + NULL, 0, &ufs_dirhashcheck, 0, + CTL_CREATE, CTL_EOL); +} + +void +ufsdirhash_init(void) +{ + + mutex_init(&ufsdirhash_lock, MUTEX_DEFAULT, IPL_NONE); + ufsdirhashblk_cache = pool_cache_init(DH_NBLKOFF * sizeof(daddr_t), 0, + 0, 0, "dirhashblk", NULL, IPL_NONE, NULL, NULL, NULL); + ufsdirhash_cache = pool_cache_init(sizeof(struct dirhash), 0, + 0, 0, "dirhash", NULL, IPL_NONE, NULL, NULL, NULL); + TAILQ_INIT(&ufsdirhash_list); + ufsdirhash_sysctl_init(); +} + +void +ufsdirhash_done(void) +{ + + KASSERT(TAILQ_EMPTY(&ufsdirhash_list)); + pool_cache_destroy(ufsdirhashblk_cache); + 
pool_cache_destroy(ufsdirhash_cache); + mutex_destroy(&ufsdirhash_lock); + sysctl_teardown(&ufsdirhash_sysctl_log); +} diff --git a/sys/ufs/ufs/ufs_extattr.c b/sys/ufs/ufs/ufs_extattr.c new file mode 100644 index 000000000..8b456b858 --- /dev/null +++ b/sys/ufs/ufs/ufs_extattr.c @@ -0,0 +1,1551 @@ +/* $NetBSD: ufs_extattr.c,v 1.35 2011/07/07 14:56:45 manu Exp $ */ + +/*- + * Copyright (c) 1999-2002 Robert N. M. Watson + * Copyright (c) 2002-2003 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed by Robert Watson for the TrustedBSD Project. + * + * This software was developed for the FreeBSD Project in part by Network + * Associates Laboratories, the Security Research Division of Network + * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), + * as part of the DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +/* + * Support for file system extended attributes on the UFS1 file system. + * + * Extended attributes are defined in the form name=value, where name is + * a nul-terminated string in the style of a file name, and value is a + * binary blob of zero or more bytes. The UFS1 extended attribute service + * layers support for extended attributes onto a backing file, in the style + * of the quota implementation, meaning that it requires no underlying format + * changes to the file system. This design choice exchanges simplicity, + * usability, and easy deployment for performance. 
+ */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ufs_extattr.c,v 1.35 2011/07/07 14:56:45 manu Exp $"); + +#ifdef _KERNEL_OPT +#include "opt_ffs.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static MALLOC_JUSTDEFINE(M_UFS_EXTATTR, "ufs_extattr","ufs extended attribute"); + +int ufs_extattr_sync = 1; +int ufs_extattr_autocreate = 1024; + +static int ufs_extattr_valid_attrname(int attrnamespace, + const char *attrname); +static int ufs_extattr_enable_with_open(struct ufsmount *ump, + struct vnode *vp, int attrnamespace, const char *attrname, + struct lwp *l); +static int ufs_extattr_enable(struct ufsmount *ump, int attrnamespace, + const char *attrname, struct vnode *backing_vnode, + struct lwp *l); +static int ufs_extattr_disable(struct ufsmount *ump, int attrnamespace, + const char *attrname, struct lwp *l); +static int ufs_extattr_get(struct vnode *vp, int attrnamespace, + const char *name, struct uio *uio, size_t *size, + kauth_cred_t cred, struct lwp *l); +static int ufs_extattr_list(struct vnode *vp, int attrnamespace, + struct uio *uio, size_t *size, int flag, + kauth_cred_t cred, struct lwp *l); +static int ufs_extattr_set(struct vnode *vp, int attrnamespace, + const char *name, struct uio *uio, kauth_cred_t cred, + struct lwp *l); +static int ufs_extattr_rm(struct vnode *vp, int attrnamespace, + const char *name, kauth_cred_t cred, struct lwp *l); +static struct ufs_extattr_list_entry *ufs_extattr_find_attr(struct ufsmount *, + int, const char *); +static int ufs_extattr_get_header(struct vnode *, + struct ufs_extattr_list_entry *, + struct ufs_extattr_header *, off_t *); + +/* + * Per-FS attribute lock protecting attribute operations. + * XXX Right now there is a lot of lock contention due to having a single + * lock per-FS; really, this should be far more fine-grained. + */ +static void +ufs_extattr_uepm_lock(struct ufsmount *ump) +{ + + /* XXX Why does this need to be recursive? */ + if (mutex_owned(&ump->um_extattr.uepm_lock)) { + ump->um_extattr.uepm_lockcnt++; + return; + } + mutex_enter(&ump->um_extattr.uepm_lock); +} + +static void +ufs_extattr_uepm_unlock(struct ufsmount *ump) +{ + + if (ump->um_extattr.uepm_lockcnt != 0) { + KASSERT(mutex_owned(&ump->um_extattr.uepm_lock)); + ump->um_extattr.uepm_lockcnt--; + return; + } + mutex_exit(&ump->um_extattr.uepm_lock); +} + +/*- + * Determine whether the name passed is a valid name for an actual + * attribute. 
+ * + * Invalid currently consists of: + * NULL pointer for attrname + * zero-length attrname (used to retrieve application attribute list) + */ +static int +ufs_extattr_valid_attrname(int attrnamespace, const char *attrname) +{ + + if (attrname == NULL) + return (0); + if (strlen(attrname) == 0) + return (0); + return (1); +} + +/* + * Autocreate an attribute storage + */ +static struct ufs_extattr_list_entry * +ufs_extattr_autocreate_attr(struct vnode *vp, int attrnamespace, + const char *attrname, struct lwp *l) +{ + struct mount *mp = vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + struct vnode *backing_vp; + struct nameidata nd; + struct pathbuf *pb; + char *path; + struct ufs_extattr_fileheader uef; + struct ufs_extattr_list_entry *uele; + int error; + + path = PNBUF_GET(); + + /* + * We only support system and user namespace autocreation + */ + switch (attrnamespace) { + case EXTATTR_NAMESPACE_SYSTEM: + (void)snprintf(path, PATH_MAX, "%s/%s/%s/%s", + mp->mnt_stat.f_mntonname, + UFS_EXTATTR_FSROOTSUBDIR, + UFS_EXTATTR_SUBDIR_SYSTEM, + attrname); + break; + case EXTATTR_NAMESPACE_USER: + (void)snprintf(path, PATH_MAX, "%s/%s/%s/%s", + mp->mnt_stat.f_mntonname, + UFS_EXTATTR_FSROOTSUBDIR, + UFS_EXTATTR_SUBDIR_USER, + attrname); + break; + default: + PNBUF_PUT(path); + return NULL; + break; + } + + /* + * When setting attribute on the root vnode, we get it + * already locked, and vn_open/namei/VFS_ROOT will try to + * look it, causing a panic. Unlock it first. + */ + if (vp->v_vflag && VV_ROOT) { + KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); + VOP_UNLOCK(vp); + } + KASSERT(VOP_ISLOCKED(vp) == 0); + + pb = pathbuf_create(path); + NDINIT(&nd, CREATE, LOCKPARENT, pb); + + error = vn_open(&nd, O_CREAT|O_RDWR, 0600); + + /* + * Reacquire the lock on the vnode if it was root. + */ + KASSERT(VOP_ISLOCKED(vp) == 0); + if (vp->v_vflag && VV_ROOT) + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); + + if (error != 0) { + pathbuf_destroy(pb); + PNBUF_PUT(path); + return NULL; + } + + KASSERT(nd.ni_vp != NULL); + KASSERT(VOP_ISLOCKED(nd.ni_vp) == LK_EXCLUSIVE); + KASSERT(VOP_ISLOCKED(nd.ni_dvp) == 0); + + /* + * backing_vp is the backing store. + */ + backing_vp = nd.ni_vp; + pathbuf_destroy(pb); + PNBUF_PUT(path); + + uef.uef_magic = UFS_EXTATTR_MAGIC; + uef.uef_version = UFS_EXTATTR_VERSION; + uef.uef_size = ufs_extattr_autocreate; + + error = vn_rdwr(UIO_WRITE, backing_vp, &uef, sizeof(uef), 0, + UIO_SYSSPACE, IO_NODELOCKED|IO_APPEND, + l->l_cred, NULL, l); + + VOP_UNLOCK(backing_vp); + + if (error != 0) { + printf("%s: write uef header failed for %s, error = %d\n", + __func__, attrname, error); + vn_close(backing_vp, FREAD|FWRITE, l->l_cred); + return NULL; + } + + /* + * ufs_extattr_enable_with_open increases the vnode reference + * count. Not sure why, but do the same here. + */ + vref(vp); + + /* + * Now enable attribute. 
+ */ + error = ufs_extattr_enable(ump,attrnamespace, attrname, backing_vp, l); + KASSERT(VOP_ISLOCKED(backing_vp) == 0); + + if (error != 0) { + printf("%s: enable %s failed, error %d\n", + __func__, attrname, error); + vn_close(backing_vp, FREAD|FWRITE, l->l_cred); + return NULL; + } + + uele = ufs_extattr_find_attr(ump, attrnamespace, attrname); + if (uele == NULL) { + printf("%s: atttribute %s created but not found!\n", + __func__, attrname); + vn_close(backing_vp, FREAD|FWRITE, l->l_cred); + return NULL; + } + + printf("%s: EA backing store autocreated for %s\n", + mp->mnt_stat.f_mntonname, attrname); + + return uele; +} + +/* + * Locate an attribute given a name and mountpoint. + * Must be holding uepm lock for the mount point. + */ +static struct ufs_extattr_list_entry * +ufs_extattr_find_attr(struct ufsmount *ump, int attrnamespace, + const char *attrname) +{ + struct ufs_extattr_list_entry *search_attribute; + + for (search_attribute = LIST_FIRST(&ump->um_extattr.uepm_list); + search_attribute != NULL; + search_attribute = LIST_NEXT(search_attribute, uele_entries)) { + if (!(strncmp(attrname, search_attribute->uele_attrname, + UFS_EXTATTR_MAXEXTATTRNAME)) && + (attrnamespace == search_attribute->uele_attrnamespace)) { + return (search_attribute); + } + } + + return (0); +} + +/* + * Initialize per-FS structures supporting extended attributes. Do not + * start extended attributes yet. + */ +void +ufs_extattr_uepm_init(struct ufs_extattr_per_mount *uepm) +{ + + uepm->uepm_flags = 0; + uepm->uepm_lockcnt = 0; + + LIST_INIT(&uepm->uepm_list); + mutex_init(&uepm->uepm_lock, MUTEX_DEFAULT, IPL_NONE); + uepm->uepm_flags |= UFS_EXTATTR_UEPM_INITIALIZED; +} + +/* + * Destroy per-FS structures supporting extended attributes. Assumes + * that EAs have already been stopped, and will panic if not. + */ +void +ufs_extattr_uepm_destroy(struct ufs_extattr_per_mount *uepm) +{ + + if (!(uepm->uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) + panic("ufs_extattr_uepm_destroy: not initialized"); + + if ((uepm->uepm_flags & UFS_EXTATTR_UEPM_STARTED)) + panic("ufs_extattr_uepm_destroy: called while still started"); + + /* + * It's not clear that either order for the next two lines is + * ideal, and it should never be a problem if this is only called + * during unmount, and with vfs_busy(). + */ + uepm->uepm_flags &= ~UFS_EXTATTR_UEPM_INITIALIZED; + mutex_destroy(&uepm->uepm_lock); +} + +/* + * Start extended attribute support on an FS. + */ +int +ufs_extattr_start(struct mount *mp, struct lwp *l) +{ + struct ufsmount *ump; + int error = 0; + + ump = VFSTOUFS(mp); + + ufs_extattr_uepm_lock(ump); + + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) { + error = EOPNOTSUPP; + goto unlock; + } + if (ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED) { + error = EBUSY; + goto unlock; + } + + ump->um_extattr.uepm_flags |= UFS_EXTATTR_UEPM_STARTED; + + ump->um_extattr.uepm_ucred = l->l_cred; + kauth_cred_hold(ump->um_extattr.uepm_ucred); + + unlock: + ufs_extattr_uepm_unlock(ump); + + return (error); +} + +/* + * Helper routine: given a locked parent directory and filename, return + * the locked vnode of the inode associated with the name. Will not + * follow symlinks, may return any type of vnode. Lock on parent will + * be released even in the event of a failure. In the event that the + * target is the parent (i.e., "."), there will be two references and + * one lock, requiring the caller to possibly special-case. 
+ */ +static int +ufs_extattr_lookup(struct vnode *start_dvp, int lockparent, const char *dirname, + struct vnode **vp, struct lwp *l) +{ + struct vop_lookup_args vargs; + struct componentname cnp; + struct vnode *target_vp; + char *pnbuf; + int error; + + KASSERT(VOP_ISLOCKED(start_dvp) == LK_EXCLUSIVE); + + pnbuf = PNBUF_GET(); + + memset(&cnp, 0, sizeof(cnp)); + cnp.cn_nameiop = LOOKUP; + cnp.cn_flags = ISLASTCN | lockparent; + cnp.cn_cred = l->l_cred; + cnp.cn_nameptr = pnbuf; + error = copystr(dirname, pnbuf, MAXPATHLEN, &cnp.cn_namelen); + if (error) { + if (lockparent == 0) { + VOP_UNLOCK(start_dvp); + } + PNBUF_PUT(pnbuf); + printf("ufs_extattr_lookup: copystr failed\n"); + return (error); + } + cnp.cn_namelen--; /* trim nul termination */ + vargs.a_desc = NULL; + vargs.a_dvp = start_dvp; + vargs.a_vpp = &target_vp; + vargs.a_cnp = &cnp; + error = ufs_lookup(&vargs); + PNBUF_PUT(pnbuf); + if (error) { + if (lockparent == 0) { + VOP_UNLOCK(start_dvp); + } + return (error); + } +#if 0 + if (target_vp == start_dvp) + panic("ufs_extattr_lookup: target_vp == start_dvp"); +#endif + + if ((target_vp != start_dvp) && (lockparent == 0)) + VOP_UNLOCK(start_dvp); + + KASSERT(VOP_ISLOCKED(target_vp) == LK_EXCLUSIVE); + *vp = target_vp; + return (0); +} + +/* + * Enable an EA using the passed filesystem, backing vnode, attribute name, + * namespace, and proc. Will perform a VOP_OPEN() on the vp, so expects vp + * to be locked when passed in. The vnode will be returned unlocked, + * regardless of success/failure of the function. As a result, the caller + * will always need to vrele(), but not vput(). + */ +static int +ufs_extattr_enable_with_open(struct ufsmount *ump, struct vnode *vp, + int attrnamespace, const char *attrname, struct lwp *l) +{ + int error; + + error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred); + if (error) { + printf("ufs_extattr_enable_with_open.VOP_OPEN(): failed " + "with %d\n", error); + VOP_UNLOCK(vp); + return (error); + } + + mutex_enter(vp->v_interlock); + vp->v_writecount++; + mutex_exit(vp->v_interlock); + + vref(vp); + + VOP_UNLOCK(vp); + + error = ufs_extattr_enable(ump, attrnamespace, attrname, vp, l); + if (error != 0) + vn_close(vp, FREAD|FWRITE, l->l_cred); + return (error); +} + +/* + * Given a locked directory vnode, iterate over the names in the directory + * and use ufs_extattr_lookup() to retrieve locked vnodes of potential + * attribute files. Then invoke ufs_extattr_enable_with_open() on each + * to attempt to start the attribute. Leaves the directory locked on + * exit. 
+ */ +static int +ufs_extattr_iterate_directory(struct ufsmount *ump, struct vnode *dvp, + int attrnamespace, struct lwp *l) +{ + struct vop_readdir_args vargs; + struct statvfs *sbp = &ump->um_mountp->mnt_stat; + struct dirent *dp, *edp; + struct vnode *attr_vp; + struct uio auio; + struct iovec aiov; + char *dirbuf; + int error, eofflag = 0; + + if (dvp->v_type != VDIR) + return (ENOTDIR); + + dirbuf = malloc(DIRBLKSIZ, M_TEMP, M_WAITOK); + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_offset = 0; + UIO_SETUP_SYSSPACE(&auio); + + vargs.a_desc = NULL; + vargs.a_vp = dvp; + vargs.a_uio = &auio; + vargs.a_cred = l->l_cred; + vargs.a_eofflag = &eofflag; + vargs.a_ncookies = NULL; + vargs.a_cookies = NULL; + + while (!eofflag) { + auio.uio_resid = DIRBLKSIZ; + aiov.iov_base = dirbuf; + aiov.iov_len = DIRBLKSIZ; + error = ufs_readdir(&vargs); + if (error) { + printf("ufs_extattr_iterate_directory: ufs_readdir " + "%d\n", error); + return (error); + } + + /* + * XXXRW: While in UFS, we always get DIRBLKSIZ returns from + * the directory code on success, on other file systems this + * may not be the case. For portability, we should check the + * read length on return from ufs_readdir(). + */ + edp = (struct dirent *)&dirbuf[DIRBLKSIZ]; + for (dp = (struct dirent *)dirbuf; dp < edp; ) { + if (dp->d_reclen == 0) + break; + /* Skip "." and ".." */ + if (dp->d_name[0] == '.' && + (dp->d_name[1] == '\0' || + (dp->d_name[1] == '.' && dp->d_name[2] == '\0'))) + goto next; + error = ufs_extattr_lookup(dvp, LOCKPARENT, + dp->d_name, &attr_vp, l); + if (error == ENOENT) { + goto next; /* keep silent */ + } else if (error) { + printf("ufs_extattr_iterate_directory: lookup " + "%s %d\n", dp->d_name, error); + } else if (attr_vp == dvp) { + vrele(attr_vp); + } else if (attr_vp->v_type != VREG) { + vput(attr_vp); + } else { + error = ufs_extattr_enable_with_open(ump, + attr_vp, attrnamespace, dp->d_name, l); + vrele(attr_vp); + if (error) { + printf("ufs_extattr_iterate_directory: " + "enable %s %d\n", dp->d_name, + error); + } else if (bootverbose) { + printf("%s: EA %s loaded\n", + sbp->f_mntonname, dp->d_name); + } + } + next: + dp = (struct dirent *) ((char *)dp + dp->d_reclen); + if (dp >= edp) + break; + } + } + free(dirbuf, M_TEMP); + + return (0); +} + +/* + * Auto-start of extended attributes, to be executed (optionally) at + * mount-time. + */ +int +ufs_extattr_autostart(struct mount *mp, struct lwp *l) +{ + struct vnode *rvp, *attr_dvp, *attr_system_dvp, *attr_user_dvp; + int error; + + /* + * Does UFS_EXTATTR_FSROOTSUBDIR exist off the filesystem root? + * If so, automatically start EA's. + */ + error = VFS_ROOT(mp, &rvp); + if (error) { + printf("ufs_extattr_autostart.VFS_ROOT() returned %d\n", + error); + return (error); + } + + KASSERT(VOP_ISLOCKED(rvp) == LK_EXCLUSIVE); + + error = ufs_extattr_lookup(rvp, 0, + UFS_EXTATTR_FSROOTSUBDIR, &attr_dvp, l); + if (error) { + /* rvp ref'd but now unlocked */ + KASSERT(VOP_ISLOCKED(rvp) == 0); + vrele(rvp); + return (error); + } + if (rvp == attr_dvp) { + /* Should never happen. 
*/ + KASSERT(VOP_ISLOCKED(rvp) == LK_EXCLUSIVE); + vrele(attr_dvp); + vput(rvp); + return (EINVAL); + } + KASSERT(VOP_ISLOCKED(rvp) == 0); + vrele(rvp); + + KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE); + + if (attr_dvp->v_type != VDIR) { + printf("ufs_extattr_autostart: %s != VDIR\n", + UFS_EXTATTR_FSROOTSUBDIR); + goto return_vput_attr_dvp; + } + + error = ufs_extattr_start(mp, l); + if (error) { + printf("ufs_extattr_autostart: ufs_extattr_start failed (%d)\n", + error); + goto return_vput_attr_dvp; + } + + /* + * Look for two subdirectories: UFS_EXTATTR_SUBDIR_SYSTEM, + * UFS_EXTATTR_SUBDIR_USER. For each, iterate over the sub-directory, + * and start with appropriate type. Failures in either don't + * result in an over-all failure. attr_dvp is left locked to + * be cleaned up on exit. + */ + error = ufs_extattr_lookup(attr_dvp, LOCKPARENT, + UFS_EXTATTR_SUBDIR_SYSTEM, &attr_system_dvp, l); + KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE); + if (error == 0) { + KASSERT(VOP_ISLOCKED(attr_system_dvp) == LK_EXCLUSIVE); + error = ufs_extattr_iterate_directory(VFSTOUFS(mp), + attr_system_dvp, EXTATTR_NAMESPACE_SYSTEM, l); + if (error) + printf("ufs_extattr_iterate_directory returned %d\n", + error); + KASSERT(VOP_ISLOCKED(attr_system_dvp) == LK_EXCLUSIVE); + vput(attr_system_dvp); + } + + error = ufs_extattr_lookup(attr_dvp, LOCKPARENT, + UFS_EXTATTR_SUBDIR_USER, &attr_user_dvp, l); + KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE); + if (error == 0) { + KASSERT(VOP_ISLOCKED(attr_user_dvp) == LK_EXCLUSIVE); + error = ufs_extattr_iterate_directory(VFSTOUFS(mp), + attr_user_dvp, EXTATTR_NAMESPACE_USER, l); + if (error) + printf("ufs_extattr_iterate_directory returned %d\n", + error); + KASSERT(VOP_ISLOCKED(attr_user_dvp) == LK_EXCLUSIVE); + vput(attr_user_dvp); + } + + /* Mask startup failures in sub-directories. */ + error = 0; + + return_vput_attr_dvp: + KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE); + vput(attr_dvp); + + return (error); +} + +/* + * Stop extended attribute support on an FS. + */ +void +ufs_extattr_stop(struct mount *mp, struct lwp *l) +{ + struct ufs_extattr_list_entry *uele; + struct ufsmount *ump = VFSTOUFS(mp); + + ufs_extattr_uepm_lock(ump); + + /* + * If we haven't been started, no big deal. Just short-circuit + * the processing work. + */ + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) { + goto unlock; + } + + while (LIST_FIRST(&ump->um_extattr.uepm_list) != NULL) { + uele = LIST_FIRST(&ump->um_extattr.uepm_list); + ufs_extattr_disable(ump, uele->uele_attrnamespace, + uele->uele_attrname, l); + } + + ump->um_extattr.uepm_flags &= ~UFS_EXTATTR_UEPM_STARTED; + + kauth_cred_free(ump->um_extattr.uepm_ucred); + ump->um_extattr.uepm_ucred = NULL; + + unlock: + ufs_extattr_uepm_unlock(ump); +} + +/* + * Enable a named attribute on the specified filesystem; provide an + * unlocked backing vnode to hold the attribute data. 
+ */ +static int +ufs_extattr_enable(struct ufsmount *ump, int attrnamespace, + const char *attrname, struct vnode *backing_vnode, struct lwp *l) +{ + struct ufs_extattr_list_entry *attribute; + struct iovec aiov; + struct uio auio; + int error = 0; + + if (!ufs_extattr_valid_attrname(attrnamespace, attrname)) + return (EINVAL); + if (backing_vnode->v_type != VREG) + return (EINVAL); + + attribute = malloc(sizeof(*attribute), M_UFS_EXTATTR, + M_WAITOK | M_ZERO); + + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) { + error = EOPNOTSUPP; + goto free_exit; + } + + if (ufs_extattr_find_attr(ump, attrnamespace, attrname)) { + error = EEXIST; + goto free_exit; + } + + strncpy(attribute->uele_attrname, attrname, + UFS_EXTATTR_MAXEXTATTRNAME); + attribute->uele_attrnamespace = attrnamespace; + memset(&attribute->uele_fileheader, 0, + sizeof(struct ufs_extattr_fileheader)); + + attribute->uele_backing_vnode = backing_vnode; + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = (void *) &attribute->uele_fileheader; + aiov.iov_len = sizeof(struct ufs_extattr_fileheader); + auio.uio_resid = sizeof(struct ufs_extattr_fileheader); + auio.uio_offset = (off_t) 0; + auio.uio_rw = UIO_READ; + UIO_SETUP_SYSSPACE(&auio); + + vn_lock(backing_vnode, LK_SHARED | LK_RETRY); + error = VOP_READ(backing_vnode, &auio, IO_NODELOCKED, + ump->um_extattr.uepm_ucred); + + if (error) + goto unlock_free_exit; + + if (auio.uio_resid != 0) { + printf("ufs_extattr_enable: malformed attribute header\n"); + error = EINVAL; + goto unlock_free_exit; + } + + /* + * Try to determine the byte order of the attribute file. + */ + if (attribute->uele_fileheader.uef_magic != UFS_EXTATTR_MAGIC) { + attribute->uele_flags |= UELE_F_NEEDSWAP; + attribute->uele_fileheader.uef_magic = + ufs_rw32(attribute->uele_fileheader.uef_magic, + UELE_NEEDSWAP(attribute)); + if (attribute->uele_fileheader.uef_magic != UFS_EXTATTR_MAGIC) { + printf("ufs_extattr_enable: invalid attribute header " + "magic\n"); + error = EINVAL; + goto unlock_free_exit; + } + } + attribute->uele_fileheader.uef_version = + ufs_rw32(attribute->uele_fileheader.uef_version, + UELE_NEEDSWAP(attribute)); + attribute->uele_fileheader.uef_size = + ufs_rw32(attribute->uele_fileheader.uef_size, + UELE_NEEDSWAP(attribute)); + + if (attribute->uele_fileheader.uef_version != UFS_EXTATTR_VERSION) { + printf("ufs_extattr_enable: incorrect attribute header " + "version\n"); + error = EINVAL; + goto unlock_free_exit; + } + + LIST_INSERT_HEAD(&ump->um_extattr.uepm_list, attribute, + uele_entries); + + VOP_UNLOCK(backing_vnode); + return (0); + + unlock_free_exit: + VOP_UNLOCK(backing_vnode); + + free_exit: + free(attribute, M_UFS_EXTATTR); + return (error); +} + +/* + * Disable extended attribute support on an FS. + */ +static int +ufs_extattr_disable(struct ufsmount *ump, int attrnamespace, + const char *attrname, struct lwp *l) +{ + struct ufs_extattr_list_entry *uele; + int error = 0; + + if (!ufs_extattr_valid_attrname(attrnamespace, attrname)) + return (EINVAL); + + uele = ufs_extattr_find_attr(ump, attrnamespace, attrname); + if (!uele) + return (ENOATTR); + + LIST_REMOVE(uele, uele_entries); + + error = vn_close(uele->uele_backing_vnode, FREAD|FWRITE, + l->l_cred); + + free(uele, M_UFS_EXTATTR); + + return (error); +} + +/* + * VFS call to manage extended attributes in UFS. If filename_vp is + * non-NULL, it must be passed in locked, and regardless of errors in + * processing, will be unlocked. 
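/*
 * A minimal stand-alone illustration (not part of this change) of the
 * byte-order probe ufs_extattr_enable() performs on the backing file
 * header: if the stored magic does not match in native order, the fields
 * are byte-swapped once (the role ufs_rw32()/UELE_NEEDSWAP play above) and
 * the magic is checked again.  The magic value and names are placeholders.
 */
#include <stdint.h>

#define TOY_EA_MAGIC	0x51ac0de5u	/* placeholder magic value */

static uint32_t
toy_bswap32(uint32_t v)
{
	return (v >> 24) | ((v >> 8) & 0x0000ff00u) |
	    ((v << 8) & 0x00ff0000u) | (v << 24);
}

/*
 * Decide whether a header written on either endianness is usable.
 * Returns 0 and sets *needswap accordingly, or -1 if the magic is wrong
 * in both byte orders.
 */
static int
toy_check_magic(uint32_t stored_magic, int *needswap)
{
	if (stored_magic == TOY_EA_MAGIC) {
		*needswap = 0;
		return 0;
	}
	if (toy_bswap32(stored_magic) == TOY_EA_MAGIC) {
		*needswap = 1;
		return 0;
	}
	return -1;
}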
+ */ +int +ufs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp, + int attrnamespace, const char *attrname) +{ + struct lwp *l = curlwp; + struct ufsmount *ump = VFSTOUFS(mp); + int error; + + /* + * Only privileged processes can configure extended attributes. + */ + if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, + NULL)) != 0) { + if (filename_vp != NULL) + VOP_UNLOCK(filename_vp); + return (error); + } + + switch(cmd) { + case UFS_EXTATTR_CMD_START: + if (filename_vp != NULL) { + VOP_UNLOCK(filename_vp); + return (EINVAL); + } + if (attrname != NULL) + return (EINVAL); + + error = ufs_extattr_autostart(mp, l); + return (error); + + case UFS_EXTATTR_CMD_STOP: + if (filename_vp != NULL) { + VOP_UNLOCK(filename_vp); + return (EINVAL); + } + if (attrname != NULL) + return (EINVAL); + + ufs_extattr_stop(mp, l); + return (0); + + case UFS_EXTATTR_CMD_ENABLE: + if (filename_vp == NULL) + return (EINVAL); + if (attrname == NULL) { + VOP_UNLOCK(filename_vp); + return (EINVAL); + } + + /* + * ufs_extattr_enable_with_open() will always unlock the + * vnode, regardless of failure. + */ + ufs_extattr_uepm_lock(ump); + error = ufs_extattr_enable_with_open(ump, filename_vp, + attrnamespace, attrname, l); + ufs_extattr_uepm_unlock(ump); + return (error); + + case UFS_EXTATTR_CMD_DISABLE: + if (filename_vp != NULL) { + VOP_UNLOCK(filename_vp); + return (EINVAL); + } + if (attrname == NULL) + return (EINVAL); + + ufs_extattr_uepm_lock(ump); + error = ufs_extattr_disable(ump, attrnamespace, attrname, l); + ufs_extattr_uepm_unlock(ump); + return (error); + + default: + return (EINVAL); + } +} + +/* + * Read extended attribute header for a given vnode and attribute. + * Backing vnode should be locked and unlocked by caller. + */ +static int +ufs_extattr_get_header(struct vnode *vp, struct ufs_extattr_list_entry *uele, + struct ufs_extattr_header *ueh, off_t *bap) +{ + struct mount *mp = vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + struct inode *ip = VTOI(vp); + off_t base_offset; + struct iovec aiov; + struct uio aio; + int error; + + /* + * Find base offset of header in file based on file header size, and + * data header size + maximum data size, indexed by inode number. + */ + base_offset = sizeof(struct ufs_extattr_fileheader) + + ip->i_number * (sizeof(struct ufs_extattr_header) + + uele->uele_fileheader.uef_size); + + /* + * Read in the data header to see if the data is defined, and if so + * how much. + */ + memset(ueh, 0, sizeof(struct ufs_extattr_header)); + aiov.iov_base = ueh; + aiov.iov_len = sizeof(struct ufs_extattr_header); + aio.uio_iov = &aiov; + aio.uio_iovcnt = 1; + aio.uio_rw = UIO_READ; + aio.uio_offset = base_offset; + aio.uio_resid = sizeof(struct ufs_extattr_header); + UIO_SETUP_SYSSPACE(&aio); + + error = VOP_READ(uele->uele_backing_vnode, &aio, + IO_NODELOCKED, ump->um_extattr.uepm_ucred); + if (error) + return error; + + /* + * Attribute headers are kept in file system byte order. + * XXX What about the blob of data? + */ + ueh->ueh_flags = ufs_rw32(ueh->ueh_flags, UELE_NEEDSWAP(uele)); + ueh->ueh_len = ufs_rw32(ueh->ueh_len, UELE_NEEDSWAP(uele)); + ueh->ueh_i_gen = ufs_rw32(ueh->ueh_i_gen, UELE_NEEDSWAP(uele)); + + /* Defined? */ + if ((ueh->ueh_flags & UFS_EXTATTR_ATTR_FLAG_INUSE) == 0) + return ENOATTR; + + /* Valid for the current inode generation? */ + if (ueh->ueh_i_gen != ip->i_gen) { + /* + * The inode itself has a different generation number + * than the uele data. 
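/*
 * ufs_extattr_get_header() above locates an inode's record by pure
 * arithmetic: the backing file holds one file header followed by a
 * fixed-size (header + data) slot per inode number, the data size being
 * uef_size from the file header.  A stand-alone sketch of that offset
 * computation (not part of this change; the struct shapes below are
 * placeholders, not the real ufs_extattr_* definitions):
 */
#include <sys/types.h>
#include <stdio.h>
#include <stdint.h>

struct toy_fileheader {			/* written once at offset 0 */
	uint32_t magic;
	uint32_t version;
	uint32_t size;			/* per-attribute data area, bytes */
};

struct toy_header {			/* one per inode, precedes its data */
	uint32_t flags;
	uint32_t len;
	uint32_t i_gen;
};

/* Byte offset of inode 'ino's record header within the backing file. */
static off_t
toy_record_offset(const struct toy_fileheader *fh, ino_t ino)
{
	return (off_t)sizeof(*fh) +
	    (off_t)ino * (sizeof(struct toy_header) + fh->size);
}

int
main(void)
{
	struct toy_fileheader fh = { 0, 1, 1024 };

	/* With a 1024-byte data area, inode 7's header starts here. */
	printf("%jd\n", (intmax_t)toy_record_offset(&fh, 7));
	return 0;
}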
For now, the best solution + * is to coerce this to undefined, and let it get cleaned + * up by the next write or extattrctl clean. + */ + printf("%s (%s): inode gen inconsistency (%u, %jd)\n", + __func__, mp->mnt_stat.f_mntonname, ueh->ueh_i_gen, + (intmax_t)ip->i_gen); + return ENOATTR; + } + + /* Local size consistency check. */ + if (ueh->ueh_len > uele->uele_fileheader.uef_size) + return ENXIO; + + /* Return base offset */ + if (bap != NULL) + *bap = base_offset; + + return 0; +} + +/* + * Vnode operation to retrieve a named extended attribute. + */ +int +ufs_getextattr(struct vop_getextattr_args *ap) +/* +vop_getextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + INOUT struct uio *a_uio; + OUT size_t *a_size; + IN kauth_cred_t a_cred; +}; +*/ +{ + struct mount *mp = ap->a_vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + int error; + + ufs_extattr_uepm_lock(ump); + + error = ufs_extattr_get(ap->a_vp, ap->a_attrnamespace, ap->a_name, + ap->a_uio, ap->a_size, ap->a_cred, curlwp); + + ufs_extattr_uepm_unlock(ump); + + return (error); +} + +/* + * Real work associated with retrieving a named attribute--assumes that + * the attribute lock has already been grabbed. + */ +static int +ufs_extattr_get(struct vnode *vp, int attrnamespace, const char *name, + struct uio *uio, size_t *size, kauth_cred_t cred, struct lwp *l) +{ + struct ufs_extattr_list_entry *attribute; + struct ufs_extattr_header ueh; + struct mount *mp = vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + off_t base_offset; + size_t len, old_len; + int error = 0; + + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) + return (EOPNOTSUPP); + + if (strlen(name) == 0) + return (EINVAL); + + error = extattr_check_cred(vp, attrnamespace, cred, l, IREAD); + if (error) + return (error); + + attribute = ufs_extattr_find_attr(ump, attrnamespace, name); + if (!attribute) + return (ENOATTR); + + /* + * Allow only offsets of zero to encourage the read/replace + * extended attribute semantic. Otherwise we can't guarantee + * atomicity, as we don't provide locks for extended attributes. + */ + if (uio != NULL && uio->uio_offset != 0) + return (ENXIO); + + /* + * Don't need to get a lock on the backing file if the getattr is + * being applied to the backing file, as the lock is already held. + */ + if (attribute->uele_backing_vnode != vp) + vn_lock(attribute->uele_backing_vnode, LK_SHARED | LK_RETRY); + + error = ufs_extattr_get_header(vp, attribute, &ueh, &base_offset); + if (error) + goto vopunlock_exit; + + /* Return full data size if caller requested it. */ + if (size != NULL) + *size = ueh.ueh_len; + + /* Return data if the caller requested it. */ + if (uio != NULL) { + /* Allow for offset into the attribute data. */ + uio->uio_offset = base_offset + sizeof(struct + ufs_extattr_header); + + /* + * Figure out maximum to transfer -- use buffer size and + * local data limit. 
+ */ + len = MIN(uio->uio_resid, ueh.ueh_len); + old_len = uio->uio_resid; + uio->uio_resid = len; + + error = VOP_READ(attribute->uele_backing_vnode, uio, + IO_NODELOCKED, ump->um_extattr.uepm_ucred); + if (error) + goto vopunlock_exit; + + uio->uio_resid = old_len - (len - uio->uio_resid); + } + + vopunlock_exit: + + if (uio != NULL) + uio->uio_offset = 0; + + if (attribute->uele_backing_vnode != vp) + VOP_UNLOCK(attribute->uele_backing_vnode); + + return (error); +} + +/* + * Vnode operation to list extended attribute for a vnode + */ +int +ufs_listextattr(struct vop_listextattr_args *ap) +/* +vop_listextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + INOUT struct uio *a_uio; + OUT size_t *a_size; + IN int flag; + IN kauth_cred_t a_cred; + struct proc *a_p; +}; +*/ +{ + struct mount *mp = ap->a_vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + int error; + + ufs_extattr_uepm_lock(ump); + + error = ufs_extattr_list(ap->a_vp, ap->a_attrnamespace, + ap->a_uio, ap->a_size, ap->a_flag, ap->a_cred, curlwp); + + ufs_extattr_uepm_unlock(ump); + + return (error); +} + +/* + * Real work associated with retrieving list of attributes--assumes that + * the attribute lock has already been grabbed. + */ +static int +ufs_extattr_list(struct vnode *vp, int attrnamespace, + struct uio *uio, size_t *size, int flag, + kauth_cred_t cred, struct lwp *l) +{ + struct ufs_extattr_list_entry *uele; + struct ufs_extattr_header ueh; + struct mount *mp = vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + size_t listsize = 0; + int error = 0; + + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) + return (EOPNOTSUPP); + + error = extattr_check_cred(vp, attrnamespace, cred, l, IREAD); + if (error) + return (error); + + LIST_FOREACH(uele, &ump->um_extattr.uepm_list, uele_entries) { + unsigned char attrnamelen; + + if (uele->uele_attrnamespace != attrnamespace) + continue; + + error = ufs_extattr_get_header(vp, uele, &ueh, NULL); + if (error == ENOATTR) + continue; + if (error != 0) + return error; + + /* + * Don't need to get a lock on the backing file if + * the listattr is being applied to the backing file, + * as the lock is already held. + */ + if (uele->uele_backing_vnode != vp) + vn_lock(uele->uele_backing_vnode, LK_SHARED | LK_RETRY); + + /* + * +1 for trailing NUL (listxattr flavor) + * or leading name length (extattr_list_file flavor) + */ + attrnamelen = strlen(uele->uele_attrname); + listsize += attrnamelen + 1; + + /* Return data if the caller requested it. */ + if (uio != NULL) { + /* + * We support two flavors. Either NUL-terminated + * strings (a la listxattr), or non NUL-terminated, + * one byte length prefixed strings (for + * extattr_list_file). EXTATTR_LIST_LENPREFIX switches + * that second behavior. + */ + if (flag & EXTATTR_LIST_LENPREFIX) { + uint8_t len = (uint8_t)attrnamelen; + + /* Copy leading name length */ + error = uiomove(&len, sizeof(len), uio); + if (error != 0) + break; + } else { + /* Include trailing NULL */ + attrnamelen++; + } + + error = uiomove(uele->uele_attrname, + (size_t)attrnamelen, uio); + if (error != 0) + break; + } + + if (uele->uele_backing_vnode != vp) + VOP_UNLOCK(uele->uele_backing_vnode); + + if (error != 0) + return error; + } + + if (uio != NULL) + uio->uio_offset = 0; + + /* Return full data size if caller requested it. */ + if (size != NULL) + *size = listsize; + + return 0; +} + +/* + * Vnode operation to remove a named attribute. 
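/*
 * Worked example for the two list encodings handled above (attribute
 * names chosen purely for illustration).  For user attributes "md5" and
 * "tags":
 *
 *   default (listxattr style, NUL-terminated):
 *     'm' 'd' '5' '\0' 't' 'a' 'g' 's' '\0'           -> 9 bytes
 *   EXTATTR_LIST_LENPREFIX (extattr_list_file style):
 *     0x03 'm' 'd' '5' 0x04 't' 'a' 'g' 's'           -> 9 bytes
 *
 * Either way each name contributes strlen(name) + 1 bytes, which is what
 * the listsize accounting in ufs_extattr_list() reports via *size.
 */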
+ */ +int +ufs_deleteextattr(struct vop_deleteextattr_args *ap) +/* +vop_deleteextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + IN kauth_cred_t a_cred; +}; +*/ +{ + struct mount *mp = ap->a_vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + int error; + + ufs_extattr_uepm_lock(ump); + + error = ufs_extattr_rm(ap->a_vp, ap->a_attrnamespace, ap->a_name, + ap->a_cred, curlwp); + + ufs_extattr_uepm_unlock(ump); + + return (error); +} + +/* + * Vnode operation to set a named attribute. + */ +int +ufs_setextattr(struct vop_setextattr_args *ap) +/* +vop_setextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + INOUT struct uio *a_uio; + IN kauth_cred_t a_cred; +}; +*/ +{ + struct mount *mp = ap->a_vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + int error; + + ufs_extattr_uepm_lock(ump); + + /* + * XXX: No longer a supported way to delete extended attributes. + */ + if (ap->a_uio == NULL) { + ufs_extattr_uepm_unlock(ump); + return (EINVAL); + } + + error = ufs_extattr_set(ap->a_vp, ap->a_attrnamespace, ap->a_name, + ap->a_uio, ap->a_cred, curlwp); + + ufs_extattr_uepm_unlock(ump); + + return (error); +} + +/* + * Real work associated with setting a vnode's extended attributes; + * assumes that the attribute lock has already been grabbed. + */ +static int +ufs_extattr_set(struct vnode *vp, int attrnamespace, const char *name, + struct uio *uio, kauth_cred_t cred, struct lwp *l) +{ + struct ufs_extattr_list_entry *attribute; + struct ufs_extattr_header ueh; + struct iovec local_aiov; + struct uio local_aio; + struct mount *mp = vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + struct inode *ip = VTOI(vp); + off_t base_offset; + int error = 0, ioflag; + + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) + return (EOPNOTSUPP); + if (!ufs_extattr_valid_attrname(attrnamespace, name)) + return (EINVAL); + + error = extattr_check_cred(vp, attrnamespace, cred, l, IWRITE); + if (error) + return (error); + + attribute = ufs_extattr_find_attr(ump, attrnamespace, name); + if (!attribute) { + attribute = ufs_extattr_autocreate_attr(vp, attrnamespace, + name, l); + if (!attribute) + return (ENOATTR); + } + + /* + * Early rejection of invalid offsets/length. + * Reject: any offset but 0 (replace) + * Any size greater than attribute size limit + */ + if (uio->uio_offset != 0 || + uio->uio_resid > attribute->uele_fileheader.uef_size) + return (ENXIO); + + /* + * Find base offset of header in file based on file header size, and + * data header size + maximum data size, indexed by inode number. + */ + base_offset = sizeof(struct ufs_extattr_fileheader) + + ip->i_number * (sizeof(struct ufs_extattr_header) + + attribute->uele_fileheader.uef_size); + + /* + * Write out a data header for the data. 
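+ * The header records the attribute length, the in-use flag and the
+ * inode generation number (byte-swapped as needed) at base_offset,
+ * ahead of the attribute data itself.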
+ */ + ueh.ueh_len = ufs_rw32((uint32_t) uio->uio_resid, + UELE_NEEDSWAP(attribute)); + ueh.ueh_flags = ufs_rw32(UFS_EXTATTR_ATTR_FLAG_INUSE, + UELE_NEEDSWAP(attribute)); + ueh.ueh_i_gen = ufs_rw32(ip->i_gen, UELE_NEEDSWAP(attribute)); + local_aiov.iov_base = &ueh; + local_aiov.iov_len = sizeof(struct ufs_extattr_header); + local_aio.uio_iov = &local_aiov; + local_aio.uio_iovcnt = 1; + local_aio.uio_rw = UIO_WRITE; + local_aio.uio_offset = base_offset; + local_aio.uio_resid = sizeof(struct ufs_extattr_header); + UIO_SETUP_SYSSPACE(&local_aio); + + /* + * Don't need to get a lock on the backing file if the setattr is + * being applied to the backing file, as the lock is already held. + */ + if (attribute->uele_backing_vnode != vp) + vn_lock(attribute->uele_backing_vnode, + LK_EXCLUSIVE | LK_RETRY); + + ioflag = IO_NODELOCKED; + if (ufs_extattr_sync) + ioflag |= IO_SYNC; + error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag, + ump->um_extattr.uepm_ucred); + if (error) + goto vopunlock_exit; + + if (local_aio.uio_resid != 0) { + error = ENXIO; + goto vopunlock_exit; + } + + /* + * Write out user data. + * XXX NOT ATOMIC WITH RESPECT TO THE HEADER. + */ + uio->uio_offset = base_offset + sizeof(struct ufs_extattr_header); + + ioflag = IO_NODELOCKED; + if (ufs_extattr_sync) + ioflag |= IO_SYNC; + error = VOP_WRITE(attribute->uele_backing_vnode, uio, ioflag, + ump->um_extattr.uepm_ucred); + + vopunlock_exit: + uio->uio_offset = 0; + + if (attribute->uele_backing_vnode != vp) + VOP_UNLOCK(attribute->uele_backing_vnode); + + return (error); +} + +/* + * Real work associated with removing an extended attribute from a vnode. + * Assumes the attribute lock has already been grabbed. + */ +static int +ufs_extattr_rm(struct vnode *vp, int attrnamespace, const char *name, + kauth_cred_t cred, struct lwp *l) +{ + struct ufs_extattr_list_entry *attribute; + struct ufs_extattr_header ueh; + struct mount *mp = vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + struct iovec local_aiov; + struct uio local_aio; + off_t base_offset; + int error = 0, ioflag; + + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) + return (EOPNOTSUPP); + if (!ufs_extattr_valid_attrname(attrnamespace, name)) + return (EINVAL); + + error = extattr_check_cred(vp, attrnamespace, cred, l, IWRITE); + if (error) + return (error); + + attribute = ufs_extattr_find_attr(ump, attrnamespace, name); + if (!attribute) + return (ENOATTR); + + /* + * Don't need to get a lock on the backing file if the getattr is + * being applied to the backing file, as the lock is already held. + */ + if (attribute->uele_backing_vnode != vp) + vn_lock(attribute->uele_backing_vnode, LK_EXCLUSIVE | LK_RETRY); + + error = ufs_extattr_get_header(vp, attribute, &ueh, &base_offset); + if (error) + goto vopunlock_exit; + + /* Flag it as not in use. */ + ueh.ueh_flags = 0; /* No need to byte swap 0 */ + ueh.ueh_len = 0; /* ...ditto... 
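+ * (Only the header is rewritten here; the attribute
+ * data bytes themselves are left in place.)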
*/ + + local_aiov.iov_base = &ueh; + local_aiov.iov_len = sizeof(struct ufs_extattr_header); + local_aio.uio_iov = &local_aiov; + local_aio.uio_iovcnt = 1; + local_aio.uio_rw = UIO_WRITE; + local_aio.uio_offset = base_offset; + local_aio.uio_resid = sizeof(struct ufs_extattr_header); + UIO_SETUP_SYSSPACE(&local_aio); + + ioflag = IO_NODELOCKED; + if (ufs_extattr_sync) + ioflag |= IO_SYNC; + error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag, + ump->um_extattr.uepm_ucred); + if (error) + goto vopunlock_exit; + + if (local_aio.uio_resid != 0) + error = ENXIO; + + vopunlock_exit: + VOP_UNLOCK(attribute->uele_backing_vnode); + + return (error); +} + +/* + * Called by UFS when an inode is no longer active and should have its + * attributes stripped. + */ +void +ufs_extattr_vnode_inactive(struct vnode *vp, struct lwp *l) +{ + struct ufs_extattr_list_entry *uele; + struct mount *mp = vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + + /* + * In that case, we cannot lock. We should not have any active vnodes + * on the fs if this is not yet initialized but is going to be, so + * this can go unlocked. + */ + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) + return; + + ufs_extattr_uepm_lock(ump); + + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) { + ufs_extattr_uepm_unlock(ump); + return; + } + + LIST_FOREACH(uele, &ump->um_extattr.uepm_list, uele_entries) + ufs_extattr_rm(vp, uele->uele_attrnamespace, + uele->uele_attrname, lwp0.l_cred, l); + + ufs_extattr_uepm_unlock(ump); +} + +void +ufs_extattr_init(void) +{ + + malloc_type_attach(M_UFS_EXTATTR); +} + +void +ufs_extattr_done(void) +{ + + malloc_type_detach(M_UFS_EXTATTR); +} diff --git a/include/ufs/ufs/ufs_extern.h b/sys/ufs/ufs/ufs_extern.h similarity index 100% rename from include/ufs/ufs/ufs_extern.h rename to sys/ufs/ufs/ufs_extern.h diff --git a/sys/ufs/ufs/ufs_ihash.c b/sys/ufs/ufs/ufs_ihash.c new file mode 100644 index 000000000..213f3357d --- /dev/null +++ b/sys/ufs/ufs/ufs_ihash.c @@ -0,0 +1,191 @@ +/* $NetBSD: ufs_ihash.c,v 1.31 2011/06/12 03:36:02 rmind Exp $ */ + +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_ihash.c 8.7 (Berkeley) 5/17/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ufs_ihash.c,v 1.31 2011/06/12 03:36:02 rmind Exp $"); + +#include +#include +#include +#include +#include + +#include +#include + +/* + * Structures associated with inode cacheing. + */ +static LIST_HEAD(ihashhead, inode) *ihashtbl; +static u_long ihash; /* size of hash table - 1 */ +#define INOHASH(device, inum) (((device) + (inum)) & ihash) + +kmutex_t ufs_ihash_lock; +kmutex_t ufs_hashlock; + +/* + * Initialize inode hash table. + */ +void +ufs_ihashinit(void) +{ + + mutex_init(&ufs_hashlock, MUTEX_DEFAULT, IPL_NONE); + mutex_init(&ufs_ihash_lock, MUTEX_DEFAULT, IPL_NONE); + ihashtbl = hashinit(desiredvnodes, HASH_LIST, true, &ihash); +} + +/* + * Reinitialize inode hash table. + */ + +void +ufs_ihashreinit(void) +{ + struct inode *ip; + struct ihashhead *oldhash, *hash; + u_long oldmask, mask, val; + int i; + + hash = hashinit(desiredvnodes, HASH_LIST, true, &mask); + mutex_enter(&ufs_ihash_lock); + oldhash = ihashtbl; + oldmask = ihash; + ihashtbl = hash; + ihash = mask; + for (i = 0; i <= oldmask; i++) { + while ((ip = LIST_FIRST(&oldhash[i])) != NULL) { + LIST_REMOVE(ip, i_hash); + val = INOHASH(ip->i_dev, ip->i_number); + LIST_INSERT_HEAD(&hash[val], ip, i_hash); + } + } + mutex_exit(&ufs_ihash_lock); + hashdone(oldhash, HASH_LIST, oldmask); +} + +/* + * Free inode hash table. + */ +void +ufs_ihashdone(void) +{ + + hashdone(ihashtbl, HASH_LIST, ihash); + mutex_destroy(&ufs_hashlock); + mutex_destroy(&ufs_ihash_lock); +} + +/* + * Use the device/inum pair to find the incore inode, and return a pointer + * to it. If it is in core, return it, even if it is locked. + */ +struct vnode * +ufs_ihashlookup(dev_t dev, ino_t inum) +{ + struct inode *ip; + struct ihashhead *ipp; + + KASSERT(mutex_owned(&ufs_ihash_lock)); + + ipp = &ihashtbl[INOHASH(dev, inum)]; + LIST_FOREACH(ip, ipp, i_hash) { + if (inum == ip->i_number && dev == ip->i_dev) + break; + } + if (ip) + return (ITOV(ip)); + return (NULLVP); +} + +/* + * Use the device/inum pair to find the incore inode, and return a pointer + * to it. If it is in core, but locked, wait for it. + */ +struct vnode * +ufs_ihashget(dev_t dev, ino_t inum, int flags) +{ + struct ihashhead *ipp; + struct inode *ip; + struct vnode *vp; + + loop: + mutex_enter(&ufs_ihash_lock); + ipp = &ihashtbl[INOHASH(dev, inum)]; + LIST_FOREACH(ip, ipp, i_hash) { + if (inum == ip->i_number && dev == ip->i_dev) { + vp = ITOV(ip); + if (flags == 0) { + mutex_exit(&ufs_ihash_lock); + } else { + mutex_enter(vp->v_interlock); + mutex_exit(&ufs_ihash_lock); + if (vget(vp, flags)) + goto loop; + } + return (vp); + } + } + mutex_exit(&ufs_ihash_lock); + return (NULL); +} + +/* + * Insert the inode into the hash table, and return it locked. 
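+ * The caller must hold ufs_hashlock; the vnode is locked exclusively
+ * before the inode is put on its hash chain under ufs_ihash_lock.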
+ */ +void +ufs_ihashins(struct inode *ip) +{ + struct ihashhead *ipp; + + KASSERT(mutex_owned(&ufs_hashlock)); + + /* lock the inode, then put it on the appropriate hash list */ + VOP_LOCK(ITOV(ip), LK_EXCLUSIVE); + + mutex_enter(&ufs_ihash_lock); + ipp = &ihashtbl[INOHASH(ip->i_dev, ip->i_number)]; + LIST_INSERT_HEAD(ipp, ip, i_hash); + mutex_exit(&ufs_ihash_lock); +} + +/* + * Remove the inode from the hash table. + */ +void +ufs_ihashrem(struct inode *ip) +{ + mutex_enter(&ufs_ihash_lock); + LIST_REMOVE(ip, i_hash); + mutex_exit(&ufs_ihash_lock); +} diff --git a/sys/ufs/ufs/ufs_inode.c b/sys/ufs/ufs/ufs_inode.c new file mode 100644 index 000000000..7a9eea4ff --- /dev/null +++ b/sys/ufs/ufs/ufs_inode.c @@ -0,0 +1,311 @@ +/* $NetBSD: ufs_inode.c,v 1.88 2011/09/20 14:01:33 chs Exp $ */ + +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_inode.c 8.9 (Berkeley) 5/14/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ufs_inode.c,v 1.88 2011/09/20 14:01:33 chs Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_ffs.h" +#include "opt_quota.h" +#include "opt_wapbl.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#ifdef UFS_DIRHASH +#include +#endif +#ifdef UFS_EXTATTR +#include +#endif + +#include + +extern int prtactive; + +/* + * Last reference to an inode. If necessary, write or delete it. 
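+ * If the link count has dropped to zero on a writable file system,
+ * the inode is truncated (one indirect block per transaction when
+ * journaling) and its mode cleared; the final free is deferred to
+ * ufs_reclaim(), and *a_recycle tells the caller whether the vnode
+ * can be recycled immediately.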
+ */ +int +ufs_inactive(void *v) +{ + struct vop_inactive_args /* { + struct vnode *a_vp; + struct bool *a_recycle; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct mount *transmp; + mode_t mode; + int error = 0; + int logged = 0; + + UFS_WAPBL_JUNLOCK_ASSERT(vp->v_mount); + + transmp = vp->v_mount; + fstrans_start(transmp, FSTRANS_LAZY); + /* + * Ignore inodes related to stale file handles. + */ + if (ip->i_mode == 0) + goto out; + if (ip->i_nlink <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { +#ifdef UFS_EXTATTR + ufs_extattr_vnode_inactive(vp, curlwp); +#endif + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) + goto out; + logged = 1; + if (ip->i_size != 0) { + /* + * When journaling, only truncate one indirect block + * at a time + */ + if (vp->v_mount->mnt_wapbl) { + uint64_t incr = MNINDIR(ip->i_ump) << + vp->v_mount->mnt_fs_bshift; /* Power of 2 */ + uint64_t base = NDADDR << + vp->v_mount->mnt_fs_bshift; + while (!error && ip->i_size > base + incr) { + /* + * round down to next full indirect + * block boundary. + */ + uint64_t nsize = base + + ((ip->i_size - base - 1) & + ~(incr - 1)); + error = UFS_TRUNCATE(vp, nsize, 0, + NOCRED); + if (error) + break; + UFS_WAPBL_END(vp->v_mount); + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) + goto out; + } + } + if (!error) + error = UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED); + } +#if defined(QUOTA) || defined(QUOTA2) + (void)chkiq(ip, -1, NOCRED, 0); +#endif + DIP_ASSIGN(ip, rdev, 0); + mode = ip->i_mode; + ip->i_mode = 0; + ip->i_omode = mode; + DIP_ASSIGN(ip, mode, 0); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + /* + * Defer final inode free and update to ufs_reclaim(). + */ + } + + if (ip->i_flag & (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) { + if (!logged++) { + int err; + err = UFS_WAPBL_BEGIN(vp->v_mount); + if (err) + goto out; + } + UFS_UPDATE(vp, NULL, NULL, 0); + } + if (logged) + UFS_WAPBL_END(vp->v_mount); +out: + /* + * If we are done with the inode, reclaim it + * so that it can be reused immediately. + */ + *ap->a_recycle = (ip->i_mode == 0); + VOP_UNLOCK(vp); + fstrans_done(transmp); + return (error); +} + +/* + * Reclaim an inode so that it can be used for other purposes. + */ +int +ufs_reclaim(struct vnode *vp) +{ + struct inode *ip = VTOI(vp); + + if (prtactive && vp->v_usecount > 1) + vprint("ufs_reclaim: pushing active", vp); + + if (!UFS_WAPBL_BEGIN(vp->v_mount)) { + UFS_UPDATE(vp, NULL, NULL, UPDATE_CLOSE); + UFS_WAPBL_END(vp->v_mount); + } + UFS_UPDATE(vp, NULL, NULL, UPDATE_CLOSE); + + /* + * Remove the inode from its hash chain. + */ + ufs_ihashrem(ip); + + if (ip->i_devvp) { + vrele(ip->i_devvp); + ip->i_devvp = 0; + } +#if defined(QUOTA) || defined(QUOTA2) + ufsquota_free(ip); +#endif +#ifdef UFS_DIRHASH + if (ip->i_dirhash != NULL) + ufsdirhash_free(ip); +#endif + return (0); +} + +/* + * allocate a range of blocks in a file. + * after this function returns, any page entirely contained within the range + * will map to invalid data and thus must be overwritten before it is made + * accessible to others. 
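+ * The pages covering the range are read or created and kept busy
+ * while GOP_ALLOC() allocates the backing blocks, so racing threads
+ * never see the old block contents; on success PG_CLEAN is cleared,
+ * and PG_RDONLY is cleared on pages now fully backed by disk.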
+ */ + +int +ufs_balloc_range(struct vnode *vp, off_t off, off_t len, kauth_cred_t cred, + int flags) +{ + off_t neweof; /* file size after the operation */ + off_t neweob; /* offset next to the last block after the operation */ + off_t pagestart; /* starting offset of range covered by pgs */ + off_t eob; /* offset next to allocated blocks */ + struct uvm_object *uobj; + int i, delta, error, npages; + int bshift = vp->v_mount->mnt_fs_bshift; + int bsize = 1 << bshift; + int ppb = MAX(bsize >> PAGE_SHIFT, 1); + struct vm_page **pgs; + size_t pgssize; + UVMHIST_FUNC("ufs_balloc_range"); UVMHIST_CALLED(ubchist); + UVMHIST_LOG(ubchist, "vp %p off 0x%x len 0x%x u_size 0x%x", + vp, off, len, vp->v_size); + + neweof = MAX(vp->v_size, off + len); + GOP_SIZE(vp, neweof, &neweob, 0); + + error = 0; + uobj = &vp->v_uobj; + + /* + * read or create pages covering the range of the allocation and + * keep them locked until the new block is allocated, so there + * will be no window where the old contents of the new block are + * visible to racing threads. + */ + + pagestart = trunc_page(off) & ~(bsize - 1); + npages = MIN(ppb, (round_page(neweob) - pagestart) >> PAGE_SHIFT); + pgssize = npages * sizeof(struct vm_page *); + pgs = kmem_zalloc(pgssize, KM_SLEEP); + + /* + * adjust off to be block-aligned. + */ + + delta = off & (bsize - 1); + off -= delta; + len += delta; + + genfs_node_wrlock(vp); + mutex_enter(uobj->vmobjlock); + error = VOP_GETPAGES(vp, pagestart, pgs, &npages, 0, + VM_PROT_WRITE, 0, PGO_SYNCIO | PGO_PASTEOF | PGO_NOBLOCKALLOC | + PGO_NOTIMESTAMP | PGO_GLOCKHELD); + if (error) { + goto out; + } + + /* + * now allocate the range. + */ + + error = GOP_ALLOC(vp, off, len, flags, cred); + genfs_node_unlock(vp); + + /* + * if the allocation succeeded, clear PG_CLEAN on all the pages + * and clear PG_RDONLY on any pages that are now fully backed + * by disk blocks. if the allocation failed, we do not invalidate + * the pages since they might have already existed and been dirty, + * in which case we need to keep them around. if we created the pages, + * they will be clean and read-only, and leaving such pages + * in the cache won't cause any problems. + */ + + GOP_SIZE(vp, off + len, &eob, 0); + mutex_enter(uobj->vmobjlock); + mutex_enter(&uvm_pageqlock); + for (i = 0; i < npages; i++) { + KASSERT((pgs[i]->flags & PG_RELEASED) == 0); + if (!error) { + if (off <= pagestart + (i << PAGE_SHIFT) && + pagestart + ((i + 1) << PAGE_SHIFT) <= eob) { + pgs[i]->flags &= ~PG_RDONLY; + } + pgs[i]->flags &= ~PG_CLEAN; + } + uvm_pageactivate(pgs[i]); + } + mutex_exit(&uvm_pageqlock); + uvm_page_unbusy(pgs, npages); + mutex_exit(uobj->vmobjlock); + + out: + kmem_free(pgs, pgssize); + return error; +} diff --git a/sys/ufs/ufs/ufs_lookup.c b/sys/ufs/ufs/ufs_lookup.c new file mode 100644 index 000000000..aa395de18 --- /dev/null +++ b/sys/ufs/ufs/ufs_lookup.c @@ -0,0 +1,1500 @@ +/* $NetBSD: ufs_lookup.c,v 1.111 2011/07/17 22:07:59 dholland Exp $ */ + +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_lookup.c 8.9 (Berkeley) 8/11/94 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ufs_lookup.c,v 1.111 2011/07/17 22:07:59 dholland Exp $"); + +#ifdef _KERNEL_OPT +#include "opt_ffs.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#ifdef UFS_DIRHASH +#include +#endif +#include +#include +#include +#include + +#ifdef DIAGNOSTIC +int dirchk = 1; +#else +int dirchk = 0; +#endif + +#define FSFMT(vp) (((vp)->v_mount->mnt_iflag & IMNT_DTYPE) == 0) + +/* + * Convert a component of a pathname into a pointer to a locked inode. + * This is a very central and rather complicated routine. + * If the file system is not maintained in a strict tree hierarchy, + * this can result in a deadlock situation (see comments in code below). + * + * The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending + * on whether the name is to be looked up, created, renamed, or deleted. + * When CREATE, RENAME, or DELETE is specified, information usable in + * creating, renaming, or deleting a directory entry may be calculated. + * If flag has LOCKPARENT or'ed into it and the target of the pathname + * exists, lookup returns both the target and its parent directory locked. + * When creating or renaming and LOCKPARENT is specified, the target may + * not be ".". When deleting and LOCKPARENT is specified, the target may + * be "."., but the caller must check to ensure it does an vrele and vput + * instead of two vputs. 
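+ * (When the target is "." the directory and the target are the same
+ * vnode, so only one of the two references may be dropped with vput.)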
+ * + * Overall outline of ufs_lookup: + * + * check accessibility of directory + * look for name in cache, if found, then if at end of path + * and deleting or creating, drop it, else return name + * search for name in directory, to found or notfound + * notfound: + * if creating, return locked directory, leaving info on available slots + * else return error + * found: + * if at end of path and deleting, return information to allow delete + * if at end of path and rewriting (RENAME and LOCKPARENT), lock target + * inode and return info to allow rewrite + * if not at end, add name to cache; if at end and neither creating + * nor deleting, add name to cache + */ +int +ufs_lookup(void *v) +{ + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap = v; + struct vnode *vdp = ap->a_dvp; /* vnode for directory being searched */ + struct inode *dp = VTOI(vdp); /* inode for directory being searched */ + struct buf *bp; /* a buffer of directory entries */ + struct direct *ep; /* the current directory entry */ + int entryoffsetinblock; /* offset of ep in bp's buffer */ + enum {NONE, COMPACT, FOUND} slotstatus; + doff_t slotoffset; /* offset of area with free space */ + int slotsize; /* size of area at slotoffset */ + int slotfreespace; /* amount of space free in slot */ + int slotneeded; /* size of the entry we're seeking */ + int numdirpasses; /* strategy for directory search */ + doff_t endsearch; /* offset to end directory search */ + doff_t prevoff; /* prev entry dp->i_offset */ + struct vnode *pdp; /* saved dp during symlink work */ + struct vnode *tdp; /* returned by VFS_VGET */ + doff_t enduseful; /* pointer past last used dir slot */ + u_long bmask; /* block offset mask */ + int namlen, error; + struct vnode **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + kauth_cred_t cred = cnp->cn_cred; + int flags; + int nameiop = cnp->cn_nameiop; + struct ufsmount *ump = dp->i_ump; + const int needswap = UFS_MPNEEDSWAP(ump); + int dirblksiz = ump->um_dirblksiz; + ino_t foundino; + struct ufs_lookup_results *results; + + flags = cnp->cn_flags; + + bp = NULL; + slotoffset = -1; + *vpp = NULL; + endsearch = 0; /* silence compiler warning */ + + /* + * Produce the auxiliary lookup results into i_crap. Increment + * its serial number so elsewhere we can tell if we're using + * stale results. This should not be done this way. XXX. + */ + results = &dp->i_crap; + dp->i_crapcounter++; + + /* + * Check accessiblity of directory. + */ + if ((error = VOP_ACCESS(vdp, VEXEC, cred)) != 0) + return (error); + + if ((flags & ISLASTCN) && (vdp->v_mount->mnt_flag & MNT_RDONLY) && + (nameiop == DELETE || nameiop == RENAME)) + return (EROFS); + + /* + * We now have a segment name to search for, and a directory to search. + * + * Before tediously performing a linear scan of the directory, + * check the name cache to see if the directory/name pair + * we are looking for is known already. + */ + if ((error = cache_lookup(vdp, vpp, cnp)) >= 0) { + return (error); + } + + fstrans_start(vdp->v_mount, FSTRANS_SHARED); + + /* + * Suppress search for slots unless creating + * file and at end of pathname, in which case + * we watch for a place to put the new file in + * case it doesn't already exist. 
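+ * slotstatus starts out as FOUND (no slot search needed); it is reset
+ * to NONE, with slotneeded set to the size of the prospective entry,
+ * only for CREATE or RENAME on the final path component.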
+ */ + slotstatus = FOUND; + slotfreespace = slotsize = slotneeded = 0; + if ((nameiop == CREATE || nameiop == RENAME) && + (flags & ISLASTCN)) { + slotstatus = NONE; + slotneeded = DIRECTSIZ(cnp->cn_namelen); + } + + /* + * If there is cached information on a previous search of + * this directory, pick up where we last left off. + * We cache only lookups as these are the most common + * and have the greatest payoff. Caching CREATE has little + * benefit as it usually must search the entire directory + * to determine that the entry does not exist. Caching the + * location of the last DELETE or RENAME has not reduced + * profiling time and hence has been removed in the interest + * of simplicity. + */ + bmask = vdp->v_mount->mnt_stat.f_iosize - 1; + +#ifdef UFS_DIRHASH + /* + * Use dirhash for fast operations on large directories. The logic + * to determine whether to hash the directory is contained within + * ufsdirhash_build(); a zero return means that it decided to hash + * this directory and it successfully built up the hash table. + */ + if (ufsdirhash_build(dp) == 0) { + /* Look for a free slot if needed. */ + enduseful = dp->i_size; + if (slotstatus != FOUND) { + slotoffset = ufsdirhash_findfree(dp, slotneeded, + &slotsize); + if (slotoffset >= 0) { + slotstatus = COMPACT; + enduseful = ufsdirhash_enduseful(dp); + if (enduseful < 0) + enduseful = dp->i_size; + } + } + /* Look up the component. */ + numdirpasses = 1; + entryoffsetinblock = 0; /* silence compiler warning */ + switch (ufsdirhash_lookup(dp, cnp->cn_nameptr, cnp->cn_namelen, + &results->ulr_offset, &bp, nameiop == DELETE ? &prevoff : NULL)) { + case 0: + ep = (struct direct *)((char *)bp->b_data + + (results->ulr_offset & bmask)); + goto foundentry; + case ENOENT: + results->ulr_offset = roundup(dp->i_size, dirblksiz); + goto notfound; + default: + /* Something failed; just do a linear search. */ + break; + } + } +#endif /* UFS_DIRHASH */ + + if (nameiop != LOOKUP || results->ulr_diroff == 0 || + results->ulr_diroff >= dp->i_size) { + entryoffsetinblock = 0; + results->ulr_offset = 0; + numdirpasses = 1; + } else { + results->ulr_offset = results->ulr_diroff; + if ((entryoffsetinblock = results->ulr_offset & bmask) && + (error = ufs_blkatoff(vdp, (off_t)results->ulr_offset, + NULL, &bp, false))) + goto out; + numdirpasses = 2; + nchstats.ncs_2passes++; + } + prevoff = results->ulr_offset; + endsearch = roundup(dp->i_size, dirblksiz); + enduseful = 0; + +searchloop: + while (results->ulr_offset < endsearch) { + if (curcpu()->ci_schedstate.spc_flags & SPCF_SHOULDYIELD) + preempt(); + /* + * If necessary, get the next directory block. + */ + if ((results->ulr_offset & bmask) == 0) { + if (bp != NULL) + brelse(bp, 0); + error = ufs_blkatoff(vdp, (off_t)results->ulr_offset, NULL, + &bp, false); + if (error) + goto out; + entryoffsetinblock = 0; + } + /* + * If still looking for a slot, and at a DIRBLKSIZ + * boundary, have to start looking for free space again. + */ + if (slotstatus == NONE && + (entryoffsetinblock & (dirblksiz - 1)) == 0) { + slotoffset = -1; + slotfreespace = 0; + } + /* + * Get pointer to next entry. + * Full validation checks are slow, so we only check + * enough to insure forward progress through the + * directory. Complete checks can be run by patching + * "dirchk" to be true. 
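+ * An entry with a zero record length, or one failing ufs_dirbadentry()
+ * when dirchk is set, is reported via ufs_dirbad() and the remainder
+ * of its DIRBLKSIZ block is skipped.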
+ */ + KASSERT(bp != NULL); + ep = (struct direct *)((char *)bp->b_data + entryoffsetinblock); + if (ep->d_reclen == 0 || + (dirchk && ufs_dirbadentry(vdp, ep, entryoffsetinblock))) { + int i; + + ufs_dirbad(dp, results->ulr_offset, "mangled entry"); + i = dirblksiz - (entryoffsetinblock & (dirblksiz - 1)); + results->ulr_offset += i; + entryoffsetinblock += i; + continue; + } + + /* + * If an appropriate sized slot has not yet been found, + * check to see if one is available. Also accumulate space + * in the current block so that we can determine if + * compaction is viable. + */ + if (slotstatus != FOUND) { + int size = ufs_rw16(ep->d_reclen, needswap); + + if (ep->d_ino != 0) + size -= DIRSIZ(FSFMT(vdp), ep, needswap); + if (size > 0) { + if (size >= slotneeded) { + slotstatus = FOUND; + slotoffset = results->ulr_offset; + slotsize = ufs_rw16(ep->d_reclen, + needswap); + } else if (slotstatus == NONE) { + slotfreespace += size; + if (slotoffset == -1) + slotoffset = results->ulr_offset; + if (slotfreespace >= slotneeded) { + slotstatus = COMPACT; + slotsize = results->ulr_offset + + ufs_rw16(ep->d_reclen, + needswap) - + slotoffset; + } + } + } + } + + /* + * Check for a name match. + */ + if (ep->d_ino) { +#if (BYTE_ORDER == LITTLE_ENDIAN) + if (FSFMT(vdp) && needswap == 0) + namlen = ep->d_type; + else + namlen = ep->d_namlen; +#else + if (FSFMT(vdp) && needswap != 0) + namlen = ep->d_type; + else + namlen = ep->d_namlen; +#endif + if (namlen == cnp->cn_namelen && + !memcmp(cnp->cn_nameptr, ep->d_name, + (unsigned)namlen)) { +#ifdef UFS_DIRHASH +foundentry: +#endif + /* + * Save directory entry's inode number and + * reclen in ndp->ni_ufs area, and release + * directory buffer. + */ + if (!FSFMT(vdp) && ep->d_type == DT_WHT) { + slotstatus = FOUND; + slotoffset = results->ulr_offset; + slotsize = ufs_rw16(ep->d_reclen, + needswap); + results->ulr_reclen = slotsize; + /* + * This is used to set results->ulr_endoff, + * which may be used by ufs_direnter2() + * as a length to truncate the + * directory to. Therefore, it must + * point past the end of the last + * non-empty directory entry. We don't + * know where that is in this case, so + * we effectively disable shrinking by + * using the existing size of the + * directory. + * + * Note that we wouldn't expect to + * shrink the directory while rewriting + * an existing entry anyway. + */ + enduseful = endsearch; + ap->a_cnp->cn_flags |= ISWHITEOUT; + numdirpasses--; + goto notfound; + } + foundino = ufs_rw32(ep->d_ino, needswap); + results->ulr_reclen = ufs_rw16(ep->d_reclen, needswap); + goto found; + } + } + prevoff = results->ulr_offset; + results->ulr_offset += ufs_rw16(ep->d_reclen, needswap); + entryoffsetinblock += ufs_rw16(ep->d_reclen, needswap); + if (ep->d_ino) + enduseful = results->ulr_offset; + } +notfound: + /* + * If we started in the middle of the directory and failed + * to find our target, we must check the beginning as well. + */ + if (numdirpasses == 2) { + numdirpasses--; + results->ulr_offset = 0; + endsearch = results->ulr_diroff; + goto searchloop; + } + if (bp != NULL) + brelse(bp, 0); + /* + * If creating, and at end of pathname and current + * directory has not been removed, then can consider + * allowing file to be created. 
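+ * "Has not been removed" is checked via dp->i_nlink != 0; on success
+ * EJUSTRETURN is returned with the slot description left in the
+ * ufs_lookup_results for a later direnter.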
+ */ + if ((nameiop == CREATE || nameiop == RENAME || + (nameiop == DELETE && + (ap->a_cnp->cn_flags & DOWHITEOUT) && + (ap->a_cnp->cn_flags & ISWHITEOUT))) && + (flags & ISLASTCN) && dp->i_nlink != 0) { + /* + * Access for write is interpreted as allowing + * creation of files in the directory. + */ + error = VOP_ACCESS(vdp, VWRITE, cred); + if (error) + goto out; + /* + * Return an indication of where the new directory + * entry should be put. If we didn't find a slot, + * then set results->ulr_count to 0 indicating + * that the new slot belongs at the end of the + * directory. If we found a slot, then the new entry + * can be put in the range from results->ulr_offset to + * results->ulr_offset + results->ulr_count. + */ + if (slotstatus == NONE) { + results->ulr_offset = roundup(dp->i_size, dirblksiz); + results->ulr_count = 0; + enduseful = results->ulr_offset; + } else if (nameiop == DELETE) { + results->ulr_offset = slotoffset; + if ((results->ulr_offset & (dirblksiz - 1)) == 0) + results->ulr_count = 0; + else + results->ulr_count = results->ulr_offset - prevoff; + } else { + results->ulr_offset = slotoffset; + results->ulr_count = slotsize; + if (enduseful < slotoffset + slotsize) + enduseful = slotoffset + slotsize; + } + results->ulr_endoff = roundup(enduseful, dirblksiz); +#if 0 /* commented out by dbj. none of the on disk fields changed */ + dp->i_flag |= IN_CHANGE | IN_UPDATE; +#endif + /* + * We return with the directory locked, so that + * the parameters we set up above will still be + * valid if we actually decide to do a direnter(). + * We return ni_vp == NULL to indicate that the entry + * does not currently exist; we leave a pointer to + * the (locked) directory inode in ndp->ni_dvp. + * + * NB - if the directory is unlocked, then this + * information cannot be used. + */ + error = EJUSTRETURN; + goto out; + } + /* + * Insert name into cache (as non-existent) if appropriate. + */ + if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE) + cache_enter(vdp, *vpp, cnp); + error = ENOENT; + goto out; + +found: + if (numdirpasses == 2) + nchstats.ncs_pass2++; + /* + * Check that directory length properly reflects presence + * of this entry. + */ + if (results->ulr_offset + DIRSIZ(FSFMT(vdp), ep, needswap) > dp->i_size) { + ufs_dirbad(dp, results->ulr_offset, "i_size too small"); + dp->i_size = results->ulr_offset + DIRSIZ(FSFMT(vdp), ep, needswap); + DIP_ASSIGN(dp, size, dp->i_size); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + UFS_WAPBL_UPDATE(vdp, NULL, NULL, UPDATE_DIROP); + } + brelse(bp, 0); + + /* + * Found component in pathname. + * If the final component of path name, save information + * in the cache as to where the entry was found. + */ + if ((flags & ISLASTCN) && nameiop == LOOKUP) + results->ulr_diroff = results->ulr_offset &~ (dirblksiz - 1); + + /* + * If deleting, and at end of pathname, return + * parameters which can be used to remove file. + * Lock the inode, being careful with ".". + */ + if (nameiop == DELETE && (flags & ISLASTCN)) { + /* + * Write access to directory required to delete files. + */ + error = VOP_ACCESS(vdp, VWRITE, cred); + if (error) + goto out; + /* + * Return pointer to current entry in results->ulr_offset, + * and distance past previous entry (if there + * is a previous entry in this block) in results->ulr_count. + * Save directory inode pointer in ndp->ni_dvp for dirremove(). 
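+ * Deleting "." returns the directory itself with an extra reference;
+ * otherwise the target is fetched with VFS_VGET and the sticky-bit
+ * ownership check may refuse the removal with EPERM.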
+ */ + if ((results->ulr_offset & (dirblksiz - 1)) == 0) + results->ulr_count = 0; + else + results->ulr_count = results->ulr_offset - prevoff; + if (dp->i_number == foundino) { + vref(vdp); + *vpp = vdp; + error = 0; + goto out; + } + if (flags & ISDOTDOT) + VOP_UNLOCK(vdp); /* race to get the inode */ + error = VFS_VGET(vdp->v_mount, foundino, &tdp); + if (flags & ISDOTDOT) + vn_lock(vdp, LK_EXCLUSIVE | LK_RETRY); + if (error) + goto out; + /* + * If directory is "sticky", then user must own + * the directory, or the file in it, else she + * may not delete it (unless she's root). This + * implements append-only directories. + */ + if ((dp->i_mode & ISVTX) && + kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, + NULL) != 0 && + kauth_cred_geteuid(cred) != dp->i_uid && + VTOI(tdp)->i_uid != kauth_cred_geteuid(cred)) { + vput(tdp); + error = EPERM; + goto out; + } + *vpp = tdp; + error = 0; + goto out; + } + + /* + * If rewriting (RENAME), return the inode and the + * information required to rewrite the present directory + * Must get inode of directory entry to verify it's a + * regular file, or empty directory. + */ + if (nameiop == RENAME && (flags & ISLASTCN)) { + error = VOP_ACCESS(vdp, VWRITE, cred); + if (error) + goto out; + /* + * Careful about locking second inode. + * This can only occur if the target is ".". + */ + if (dp->i_number == foundino) { + error = EISDIR; + goto out; + } + if (flags & ISDOTDOT) + VOP_UNLOCK(vdp); /* race to get the inode */ + error = VFS_VGET(vdp->v_mount, foundino, &tdp); + if (flags & ISDOTDOT) + vn_lock(vdp, LK_EXCLUSIVE | LK_RETRY); + if (error) + goto out; + *vpp = tdp; + error = 0; + goto out; + } + + /* + * Step through the translation in the name. We do not `vput' the + * directory because we may need it again if a symbolic link + * is relative to the current directory. Instead we save it + * unlocked as "pdp". We must get the target inode before unlocking + * the directory to insure that the inode will not be removed + * before we get it. We prevent deadlock by always fetching + * inodes from the root, moving down the directory tree. Thus + * when following backward pointers ".." we must unlock the + * parent directory before getting the requested directory. + * There is a potential race condition here if both the current + * and parent directories are removed before the VFS_VGET for the + * inode associated with ".." returns. We hope that this occurs + * infrequently since we cannot avoid this race condition without + * implementing a sophisticated deadlock detection algorithm. + * Note also that this simple deadlock detection scheme will not + * work if the file system has any hard links other than ".." + * that point backwards in the directory structure. + */ + pdp = vdp; + if (flags & ISDOTDOT) { + VOP_UNLOCK(pdp); /* race to get the inode */ + error = VFS_VGET(vdp->v_mount, foundino, &tdp); + vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY); + if (error) { + goto out; + } + *vpp = tdp; + } else if (dp->i_number == foundino) { + vref(vdp); /* we want ourself, ie "." */ + *vpp = vdp; + } else { + error = VFS_VGET(vdp->v_mount, foundino, &tdp); + if (error) + goto out; + *vpp = tdp; + } + + /* + * Insert name into cache if appropriate. 
+ */ + if (cnp->cn_flags & MAKEENTRY) + cache_enter(vdp, *vpp, cnp); + error = 0; + +out: + fstrans_done(vdp->v_mount); + return error; +} + +void +ufs_dirbad(struct inode *ip, doff_t offset, const char *how) +{ + struct mount *mp; + + mp = ITOV(ip)->v_mount; + printf("%s: bad dir ino %llu at offset %d: %s\n", + mp->mnt_stat.f_mntonname, (unsigned long long)ip->i_number, + offset, how); + if ((mp->mnt_stat.f_flag & MNT_RDONLY) == 0) + panic("bad dir"); +} + +/* + * Do consistency checking on a directory entry: + * record length must be multiple of 4 + * entry must fit in rest of its DIRBLKSIZ block + * record must be large enough to contain entry + * name is not longer than FFS_MAXNAMLEN + * name must be as long as advertised, and null terminated + */ +int +ufs_dirbadentry(struct vnode *dp, struct direct *ep, int entryoffsetinblock) +{ + int i; + int namlen; + struct ufsmount *ump = VFSTOUFS(dp->v_mount); + const int needswap = UFS_MPNEEDSWAP(ump); + int dirblksiz = ump->um_dirblksiz; + +#if (BYTE_ORDER == LITTLE_ENDIAN) + if (FSFMT(dp) && needswap == 0) + namlen = ep->d_type; + else + namlen = ep->d_namlen; +#else + if (FSFMT(dp) && needswap != 0) + namlen = ep->d_type; + else + namlen = ep->d_namlen; +#endif + if ((ufs_rw16(ep->d_reclen, needswap) & 0x3) != 0 || + ufs_rw16(ep->d_reclen, needswap) > + dirblksiz - (entryoffsetinblock & (dirblksiz - 1)) || + ufs_rw16(ep->d_reclen, needswap) < + DIRSIZ(FSFMT(dp), ep, needswap) || + namlen > FFS_MAXNAMLEN) { + /*return (1); */ + printf("First bad, reclen=%#x, DIRSIZ=%lu, namlen=%d, " + "flags=%#x, entryoffsetinblock=%d, dirblksiz = %d\n", + ufs_rw16(ep->d_reclen, needswap), + (u_long)DIRSIZ(FSFMT(dp), ep, needswap), + namlen, dp->v_mount->mnt_flag, entryoffsetinblock, + dirblksiz); + goto bad; + } + if (ep->d_ino == 0) + return (0); + for (i = 0; i < namlen; i++) + if (ep->d_name[i] == '\0') { + /*return (1); */ + printf("Second bad\n"); + goto bad; + } + if (ep->d_name[i]) + goto bad; + return (0); +bad: + return (1); +} + +/* + * Construct a new directory entry after a call to namei, using the + * name in the componentname argument cnp. The argument ip is the + * inode to which the new directory entry will refer. + */ +void +ufs_makedirentry(struct inode *ip, struct componentname *cnp, + struct direct *newdirp) +{ + newdirp->d_ino = ip->i_number; + newdirp->d_namlen = cnp->cn_namelen; + memcpy(newdirp->d_name, cnp->cn_nameptr, (size_t)cnp->cn_namelen); + newdirp->d_name[cnp->cn_namelen] = '\0'; + if (FSFMT(ITOV(ip))) + newdirp->d_type = 0; + else + newdirp->d_type = IFTODT(ip->i_mode); +} + +/* + * Write a directory entry after a call to namei, using the parameters + * that ufs_lookup left in nameidata and in the ufs_lookup_results. + * + * DVP is the directory to be updated. It must be locked. + * ULR is the ufs_lookup_results structure from the final lookup step. + * TVP is not used. (XXX: why is it here? remove it) + * DIRP is the new directory entry contents. + * CNP is the componentname from the final lookup step. + * NEWDIRBP is not used and (XXX) should be removed. The previous + * comment here said it was used by the now-removed softupdates code. + * + * The link count of the target inode is *not* incremented; the + * caller does that. + * + * If ulr->ulr_count is 0, ufs_lookup did not find space to insert the + * directory entry. 
ulr_offset, which is the place to put the entry, + * should be on a block boundary (and should be at the end of the + * directory AFAIK) and a fresh block is allocated to put the new + * directory entry in. + * + * If ulr->ulr_count is not zero, ufs_lookup found a slot to insert + * the entry into. This slot ranges from ulr_offset to ulr_offset + + * ulr_count. However, this slot may already be partially populated + * requiring compaction. See notes below. + * + * Furthermore, if ulr_count is not zero and ulr_endoff is not the + * same as i_size, the directory is truncated to size ulr_endoff. + */ +int +ufs_direnter(struct vnode *dvp, const struct ufs_lookup_results *ulr, + struct vnode *tvp, struct direct *dirp, + struct componentname *cnp, struct buf *newdirbp) +{ + kauth_cred_t cr; + struct lwp *l; + int newentrysize; + struct inode *dp; + struct buf *bp; + u_int dsize; + struct direct *ep, *nep; + int error, ret, blkoff, loc, spacefree; + char *dirbuf; + struct timespec ts; + struct ufsmount *ump = VFSTOUFS(dvp->v_mount); + const int needswap = UFS_MPNEEDSWAP(ump); + int dirblksiz = ump->um_dirblksiz; + + UFS_WAPBL_JLOCK_ASSERT(dvp->v_mount); + + error = 0; + cr = cnp->cn_cred; + l = curlwp; + + dp = VTOI(dvp); + newentrysize = DIRSIZ(0, dirp, 0); + +#if 0 + struct ufs_lookup_results *ulr; + /* XXX should handle this material another way */ + ulr = &dp->i_crap; + UFS_CHECK_CRAPCOUNTER(dp); +#endif + + if (ulr->ulr_count == 0) { + /* + * If ulr_count is 0, then namei could find no + * space in the directory. Here, ulr_offset will + * be on a directory block boundary and we will write the + * new entry into a fresh block. + */ + if (ulr->ulr_offset & (dirblksiz - 1)) + panic("ufs_direnter: newblk"); + if ((error = UFS_BALLOC(dvp, (off_t)ulr->ulr_offset, dirblksiz, + cr, B_CLRBUF | B_SYNC, &bp)) != 0) { + return (error); + } + dp->i_size = ulr->ulr_offset + dirblksiz; + DIP_ASSIGN(dp, size, dp->i_size); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + uvm_vnp_setsize(dvp, dp->i_size); + dirp->d_reclen = ufs_rw16(dirblksiz, needswap); + dirp->d_ino = ufs_rw32(dirp->d_ino, needswap); + if (FSFMT(dvp)) { +#if (BYTE_ORDER == LITTLE_ENDIAN) + if (needswap == 0) { +#else + if (needswap != 0) { +#endif + u_char tmp = dirp->d_namlen; + dirp->d_namlen = dirp->d_type; + dirp->d_type = tmp; + } + } + blkoff = ulr->ulr_offset & (ump->um_mountp->mnt_stat.f_iosize - 1); + memcpy((char *)bp->b_data + blkoff, dirp, newentrysize); +#ifdef UFS_DIRHASH + if (dp->i_dirhash != NULL) { + ufsdirhash_newblk(dp, ulr->ulr_offset); + ufsdirhash_add(dp, dirp, ulr->ulr_offset); + ufsdirhash_checkblock(dp, (char *)bp->b_data + blkoff, + ulr->ulr_offset); + } +#endif + error = VOP_BWRITE(bp->b_vp, bp); + vfs_timestamp(&ts); + ret = UFS_UPDATE(dvp, &ts, &ts, UPDATE_DIROP); + if (error == 0) + return (ret); + return (error); + } + + /* + * If ulr_count is non-zero, then namei found space for the new + * entry in the range ulr_offset to url_offset + url_count + * in the directory. To use this space, we may have to compact + * the entries located there, by copying them together towards the + * beginning of the block, leaving the free space in one usable + * chunk at the end. + */ + + /* + * Increase size of directory if entry eats into new space. + * This should never push the size past a new multiple of + * DIRBLKSIZ. + * + * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN. 
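+ * If it happens anyway, the directory size is simply extended (with
+ * a DIAGNOSTIC warning) before the block is fetched.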
+ */ + if (ulr->ulr_offset + ulr->ulr_count > dp->i_size) { +#ifdef DIAGNOSTIC + printf("ufs_direnter: reached 4.2-only block, " + "not supposed to happen\n"); +#endif + dp->i_size = ulr->ulr_offset + ulr->ulr_count; + DIP_ASSIGN(dp, size, dp->i_size); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP); + } + /* + * Get the block containing the space for the new directory entry. + */ + error = ufs_blkatoff(dvp, (off_t)ulr->ulr_offset, &dirbuf, &bp, true); + if (error) { + return (error); + } + /* + * Find space for the new entry. In the simple case, the entry at + * offset base will have the space. If it does not, then namei + * arranged that compacting the region dp->i_offset to + * dp->i_offset + dp->i_count would yield the space. + */ + ep = (struct direct *)dirbuf; + dsize = (ep->d_ino != 0) ? DIRSIZ(FSFMT(dvp), ep, needswap) : 0; + spacefree = ufs_rw16(ep->d_reclen, needswap) - dsize; + for (loc = ufs_rw16(ep->d_reclen, needswap); loc < ulr->ulr_count; ) { + uint16_t reclen; + + nep = (struct direct *)(dirbuf + loc); + + /* Trim the existing slot (NB: dsize may be zero). */ + ep->d_reclen = ufs_rw16(dsize, needswap); + ep = (struct direct *)((char *)ep + dsize); + + reclen = ufs_rw16(nep->d_reclen, needswap); + loc += reclen; + if (nep->d_ino == 0) { + /* + * A mid-block unused entry. Such entries are + * never created by the kernel, but fsck_ffs + * can create them (and it doesn't fix them). + * + * Add up the free space, and initialise the + * relocated entry since we don't memcpy it. + */ + spacefree += reclen; + ep->d_ino = 0; + dsize = 0; + continue; + } + dsize = DIRSIZ(FSFMT(dvp), nep, needswap); + spacefree += reclen - dsize; +#ifdef UFS_DIRHASH + if (dp->i_dirhash != NULL) + ufsdirhash_move(dp, nep, + ulr->ulr_offset + ((char *)nep - dirbuf), + ulr->ulr_offset + ((char *)ep - dirbuf)); +#endif + memcpy((void *)ep, (void *)nep, dsize); + } + /* + * Here, `ep' points to a directory entry containing `dsize' in-use + * bytes followed by `spacefree' unused bytes. If ep->d_ino == 0, + * then the entry is completely unused (dsize == 0). The value + * of ep->d_reclen is always indeterminate. + * + * Update the pointer fields in the previous entry (if any), + * copy in the new entry, and write out the block. 
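+ * If the slot is unused, or is a whiteout for the same name, the new
+ * entry absorbs the entire region; otherwise the previous entry keeps
+ * dsize bytes and the new entry takes just the free space. Afterwards
+ * the directory may be truncated back to ulr_endoff.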
+ */ + if (ep->d_ino == 0 || + (ufs_rw32(ep->d_ino, needswap) == WINO && + memcmp(ep->d_name, dirp->d_name, dirp->d_namlen) == 0)) { + if (spacefree + dsize < newentrysize) + panic("ufs_direnter: compact1"); + dirp->d_reclen = spacefree + dsize; + } else { + if (spacefree < newentrysize) + panic("ufs_direnter: compact2"); + dirp->d_reclen = spacefree; + ep->d_reclen = ufs_rw16(dsize, needswap); + ep = (struct direct *)((char *)ep + dsize); + } + dirp->d_reclen = ufs_rw16(dirp->d_reclen, needswap); + dirp->d_ino = ufs_rw32(dirp->d_ino, needswap); + if (FSFMT(dvp)) { +#if (BYTE_ORDER == LITTLE_ENDIAN) + if (needswap == 0) { +#else + if (needswap != 0) { +#endif + u_char tmp = dirp->d_namlen; + dirp->d_namlen = dirp->d_type; + dirp->d_type = tmp; + } + } +#ifdef UFS_DIRHASH + if (dp->i_dirhash != NULL && (ep->d_ino == 0 || + dirp->d_reclen == spacefree)) + ufsdirhash_add(dp, dirp, ulr->ulr_offset + ((char *)ep - dirbuf)); +#endif + memcpy((void *)ep, (void *)dirp, (u_int)newentrysize); +#ifdef UFS_DIRHASH + if (dp->i_dirhash != NULL) + ufsdirhash_checkblock(dp, dirbuf - + (ulr->ulr_offset & (dirblksiz - 1)), + ulr->ulr_offset & ~(dirblksiz - 1)); +#endif + error = VOP_BWRITE(bp->b_vp, bp); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + /* + * If all went well, and the directory can be shortened, proceed + * with the truncation. Note that we have to unlock the inode for + * the entry that we just entered, as the truncation may need to + * lock other inodes which can lead to deadlock if we also hold a + * lock on the newly entered node. + */ + if (error == 0 && ulr->ulr_endoff && ulr->ulr_endoff < dp->i_size) { +#ifdef UFS_DIRHASH + if (dp->i_dirhash != NULL) + ufsdirhash_dirtrunc(dp, ulr->ulr_endoff); +#endif + (void) UFS_TRUNCATE(dvp, (off_t)ulr->ulr_endoff, IO_SYNC, cr); + } + UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP); + return (error); +} + +/* + * Remove a directory entry after a call to namei, using the + * parameters that ufs_lookup left in nameidata and in the + * ufs_lookup_results. + * + * DVP is the directory to be updated. It must be locked. + * ULR is the ufs_lookup_results structure from the final lookup step. + * IP, if not null, is the inode being unlinked. + * FLAGS may contain DOWHITEOUT. + * ISRMDIR is not used and (XXX) should be removed. + * + * If FLAGS contains DOWHITEOUT the entry is replaced with a whiteout + * instead of being cleared. + * + * ulr->ulr_offset contains the position of the directory entry + * to be removed. + * + * ulr->ulr_reclen contains the size of the directory entry to be + * removed. + * + * ulr->ulr_count contains the size of the *previous* directory + * entry. This allows finding it, for free space management. If + * ulr_count is 0, the target entry is at the beginning of the + * directory. (Does this ever happen? The first entry should be ".", + * which should only be removed at rmdir time. Does rmdir come here + * to clear out the "." and ".." entries? Perhaps, but I doubt it.) + * + * The space is marked free by adding it to the record length (not + * name length) of the preceding entry. If the first entry becomes + * free, it is marked free by setting the inode number to 0. + * + * The link count of IP is decremented. Note that this is not the + * inverse behavior of ufs_direnter, which does not adjust link + * counts. Sigh. 
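+ * With DOWHITEOUT the entry is converted to a whiteout (d_ino = WINO,
+ * d_type = DT_WHT) instead of being freed.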
+ */ +int +ufs_dirremove(struct vnode *dvp, const struct ufs_lookup_results *ulr, + struct inode *ip, int flags, int isrmdir) +{ + struct inode *dp = VTOI(dvp); + struct direct *ep; + struct buf *bp; + int error; +#ifdef FFS_EI + const int needswap = UFS_MPNEEDSWAP(dp->i_ump); +#endif + + UFS_WAPBL_JLOCK_ASSERT(dvp->v_mount); + + if (flags & DOWHITEOUT) { + /* + * Whiteout entry: set d_ino to WINO. + */ + error = ufs_blkatoff(dvp, (off_t)ulr->ulr_offset, (void *)&ep, + &bp, true); + if (error) + return (error); + ep->d_ino = ufs_rw32(WINO, needswap); + ep->d_type = DT_WHT; + goto out; + } + + if ((error = ufs_blkatoff(dvp, + (off_t)(ulr->ulr_offset - ulr->ulr_count), (void *)&ep, &bp, true)) != 0) + return (error); + +#ifdef UFS_DIRHASH + /* + * Remove the dirhash entry. This is complicated by the fact + * that `ep' is the previous entry when dp->i_count != 0. + */ + if (dp->i_dirhash != NULL) + ufsdirhash_remove(dp, (ulr->ulr_count == 0) ? ep : + (struct direct *)((char *)ep + + ufs_rw16(ep->d_reclen, needswap)), ulr->ulr_offset); +#endif + + if (ulr->ulr_count == 0) { + /* + * First entry in block: set d_ino to zero. + */ + ep->d_ino = 0; + } else { + /* + * Collapse new free space into previous entry. + */ + ep->d_reclen = + ufs_rw16(ufs_rw16(ep->d_reclen, needswap) + ulr->ulr_reclen, + needswap); + } + +#ifdef UFS_DIRHASH + if (dp->i_dirhash != NULL) { + int dirblksiz = ip->i_ump->um_dirblksiz; + ufsdirhash_checkblock(dp, (char *)ep - + ((ulr->ulr_offset - ulr->ulr_count) & (dirblksiz - 1)), + ulr->ulr_offset & ~(dirblksiz - 1)); + } +#endif + +out: + if (ip) { + ip->i_nlink--; + DIP_ASSIGN(ip, nlink, ip->i_nlink); + ip->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(ITOV(ip), NULL, NULL, 0); + } + error = VOP_BWRITE(bp->b_vp, bp); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + /* + * If the last named reference to a snapshot goes away, + * drop its snapshot reference so that it will be reclaimed + * when last open reference goes away. + */ + if (ip != 0 && (ip->i_flags & SF_SNAPSHOT) != 0 && + ip->i_nlink == 0) + ffs_snapgone(ip); + UFS_WAPBL_UPDATE(dvp, NULL, NULL, 0); + return (error); +} + +/* + * Rewrite an existing directory entry to point at the inode supplied. + * + * DP is the directory to update. + * OFFSET is the position of the entry in question. It may come + * from ulr_offset of a ufs_lookup_results. + * OIP is the old inode the directory previously pointed to. + * NEWINUM is the number of the new inode. + * NEWTYPE is the new value for the type field of the directory entry. + * (This is ignored if the fs doesn't support that.) + * ISRMDIR is not used and (XXX) should be removed. + * IFLAGS are added to DP's inode flags. + * + * The link count of OIP is decremented. Note that the link count of + * the new inode is *not* incremented. Yay for symmetry. 
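+ * Only d_ino (and, where the file system stores entry types, d_type)
+ * is changed in place; IFLAGS is ORed into the directory's inode
+ * flags after the block is written back.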
+ */ +int +ufs_dirrewrite(struct inode *dp, off_t offset, + struct inode *oip, ino_t newinum, int newtype, + int isrmdir, int iflags) +{ + struct buf *bp; + struct direct *ep; + struct vnode *vdp = ITOV(dp); + int error; + + error = ufs_blkatoff(vdp, offset, (void *)&ep, &bp, true); + if (error) + return (error); + ep->d_ino = ufs_rw32(newinum, UFS_MPNEEDSWAP(dp->i_ump)); + if (!FSFMT(vdp)) + ep->d_type = newtype; + oip->i_nlink--; + DIP_ASSIGN(oip, nlink, oip->i_nlink); + oip->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(ITOV(oip), NULL, NULL, UPDATE_DIROP); + error = VOP_BWRITE(bp->b_vp, bp); + dp->i_flag |= iflags; + /* + * If the last named reference to a snapshot goes away, + * drop its snapshot reference so that it will be reclaimed + * when last open reference goes away. + */ + if ((oip->i_flags & SF_SNAPSHOT) != 0 && oip->i_nlink == 0) + ffs_snapgone(oip); + UFS_WAPBL_UPDATE(vdp, NULL, NULL, UPDATE_DIROP); + return (error); +} + +/* + * Check if a directory is empty or not. + * Inode supplied must be locked. + * + * Using a struct dirtemplate here is not precisely + * what we want, but better than using a struct direct. + * + * NB: does not handle corrupted directories. + */ +int +ufs_dirempty(struct inode *ip, ino_t parentino, kauth_cred_t cred) +{ + doff_t off; + struct dirtemplate dbuf; + struct direct *dp = (struct direct *)&dbuf; + int error, namlen; + size_t count; + const int needswap = UFS_IPNEEDSWAP(ip); +#define MINDIRSIZ (sizeof (struct dirtemplate) / 2) + + for (off = 0; off < ip->i_size; + off += ufs_rw16(dp->d_reclen, needswap)) { + error = vn_rdwr(UIO_READ, ITOV(ip), (void *)dp, MINDIRSIZ, off, + UIO_SYSSPACE, IO_NODELOCKED, cred, &count, NULL); + /* + * Since we read MINDIRSIZ, residual must + * be 0 unless we're at end of file. + */ + if (error || count != 0) + return (0); + /* avoid infinite loops */ + if (dp->d_reclen == 0) + return (0); + /* skip empty entries */ + if (dp->d_ino == 0 || ufs_rw32(dp->d_ino, needswap) == WINO) + continue; + /* accept only "." and ".." */ +#if (BYTE_ORDER == LITTLE_ENDIAN) + if (FSFMT(ITOV(ip)) && needswap == 0) + namlen = dp->d_type; + else + namlen = dp->d_namlen; +#else + if (FSFMT(ITOV(ip)) && needswap != 0) + namlen = dp->d_type; + else + namlen = dp->d_namlen; +#endif + if (namlen > 2) + return (0); + if (dp->d_name[0] != '.') + return (0); + /* + * At this point namlen must be 1 or 2. + * 1 implies ".", 2 implies ".." if second + * char is also "." + */ + if (namlen == 1 && + ufs_rw32(dp->d_ino, needswap) == ip->i_number) + continue; + if (dp->d_name[1] == '.' && + ufs_rw32(dp->d_ino, needswap) == parentino) + continue; + return (0); + } + return (1); +} + +/* + * Check if source directory is in the path of the target directory. + * Target is supplied locked, source is unlocked. + * The target is always vput before returning. 
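/*
 * Illustrative sketch (not taken from the patch): the check described above
 * (implemented just below as ufs_checkpath()) amounts to walking ".." from
 * the target toward the root and looking for the source.  A hedged
 * user-space analogue using POSIX calls and (st_dev, st_ino) pairs instead
 * of locked vnodes; is_ancestor() and its behaviour are inventions of this
 * sketch.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

/* Return 1 if "ancestor" lies on the path from "start" up to the root. */
static int is_ancestor(const char *ancestor, const char *start)
{
    struct stat anc, cur, next;
    int fd, nfd, found = 0;

    if (stat(ancestor, &anc) == -1)
        return -1;
    if ((fd = open(start, O_RDONLY | O_DIRECTORY)) == -1)
        return -1;
    for (;;) {
        if (fstat(fd, &cur) == -1)
            break;
        if (cur.st_dev == anc.st_dev && cur.st_ino == anc.st_ino) {
            found = 1;                      /* reached the candidate ancestor */
            break;
        }
        if ((nfd = openat(fd, "..", O_RDONLY | O_DIRECTORY)) == -1)
            break;
        if (fstat(nfd, &next) == -1) {
            close(nfd);
            break;
        }
        if (next.st_dev == cur.st_dev && next.st_ino == cur.st_ino) {
            close(nfd);                     /* ".." of the root is the root */
            break;
        }
        close(fd);
        fd = nfd;                           /* step one directory upward */
    }
    close(fd);
    return found;
}

int main(void)
{
    printf("/ is an ancestor of /tmp: %d\n", is_ancestor("/", "/tmp"));
    return 0;
}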
+ */ +int +ufs_checkpath(struct inode *source, struct inode *target, kauth_cred_t cred) +{ + struct vnode *nextvp, *vp; + int error, rootino, namlen; + struct dirtemplate dirbuf; + const int needswap = UFS_MPNEEDSWAP(target->i_ump); + + vp = ITOV(target); + if (target->i_number == source->i_number) { + error = EEXIST; + goto out; + } + rootino = ROOTINO; + error = 0; + if (target->i_number == rootino) + goto out; + + for (;;) { + if (vp->v_type != VDIR) { + error = ENOTDIR; + break; + } + error = vn_rdwr(UIO_READ, vp, (void *)&dirbuf, + sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE, + IO_NODELOCKED, cred, NULL, NULL); + if (error != 0) + break; +#if (BYTE_ORDER == LITTLE_ENDIAN) + if (FSFMT(vp) && needswap == 0) + namlen = dirbuf.dotdot_type; + else + namlen = dirbuf.dotdot_namlen; +#else + if (FSFMT(vp) && needswap != 0) + namlen = dirbuf.dotdot_type; + else + namlen = dirbuf.dotdot_namlen; +#endif + if (namlen != 2 || + dirbuf.dotdot_name[0] != '.' || + dirbuf.dotdot_name[1] != '.') { + error = ENOTDIR; + break; + } + if (ufs_rw32(dirbuf.dotdot_ino, needswap) == source->i_number) { + error = EINVAL; + break; + } + if (ufs_rw32(dirbuf.dotdot_ino, needswap) == rootino) + break; + VOP_UNLOCK(vp); + error = VFS_VGET(vp->v_mount, + ufs_rw32(dirbuf.dotdot_ino, needswap), &nextvp); + vrele(vp); + if (error) { + vp = NULL; + break; + } + vp = nextvp; + } + +out: + if (error == ENOTDIR) + printf("checkpath: .. not a directory\n"); + if (vp != NULL) + vput(vp); + return (error); +} + +/* + * Extract the inode number of ".." from a directory. + * Helper for ufs_parentcheck. + */ +static int +ufs_readdotdot(struct vnode *vp, int needswap, kauth_cred_t cred, ino_t *result) +{ + struct dirtemplate dirbuf; + int namlen, error; + + error = vn_rdwr(UIO_READ, vp, &dirbuf, + sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE, + IO_NODELOCKED, cred, NULL, NULL); + if (error) { + return error; + } + +#if (BYTE_ORDER == LITTLE_ENDIAN) + if (FSFMT(vp) && needswap == 0) + namlen = dirbuf.dotdot_type; + else + namlen = dirbuf.dotdot_namlen; +#else + if (FSFMT(vp) && needswap != 0) + namlen = dirbuf.dotdot_type; + else + namlen = dirbuf.dotdot_namlen; +#endif + if (namlen != 2 || + dirbuf.dotdot_name[0] != '.' || + dirbuf.dotdot_name[1] != '.') { + printf("ufs_readdotdot: directory %llu contains " + "garbage instead of ..\n", + (unsigned long long) VTOI(vp)->i_number); + return ENOTDIR; + } + *result = ufs_rw32(dirbuf.dotdot_ino, needswap); + return 0; +} + +/* + * Check if LOWER is a descendent of UPPER. If we find UPPER, return + * nonzero in FOUND and return a reference to the immediate descendent + * of UPPER in UPPERCHILD. If we don't find UPPER (that is, if we + * reach the volume root and that isn't UPPER), return zero in FOUND + * and null in UPPERCHILD. + * + * Neither UPPER nor LOWER should be locked. + * + * On error (such as a permissions error checking up the directory + * tree) fail entirely. + * + * Note that UPPER and LOWER must be on the same volume, and because + * we inspect only that volume NEEDSWAP can be constant. 
+ */ +int +ufs_parentcheck(struct vnode *upper, struct vnode *lower, kauth_cred_t cred, + int *found_ret, struct vnode **upperchild_ret) +{ + const int needswap = UFS_MPNEEDSWAP(VTOI(lower)->i_ump); + ino_t upper_ino, found_ino; + struct vnode *current, *next; + int error; + + if (upper == lower) { + vref(upper); + *found_ret = 1; + *upperchild_ret = upper; + return 0; + } + if (VTOI(lower)->i_number == ROOTINO) { + *found_ret = 0; + *upperchild_ret = NULL; + return 0; + } + + upper_ino = VTOI(upper)->i_number; + + current = lower; + vref(current); + vn_lock(current, LK_EXCLUSIVE | LK_RETRY); + + for (;;) { + error = ufs_readdotdot(current, needswap, cred, &found_ino); + if (error) { + vput(current); + return error; + } + if (found_ino == upper_ino) { + VOP_UNLOCK(current); + *found_ret = 1; + *upperchild_ret = current; + return 0; + } + if (found_ino == ROOTINO) { + vput(current); + *found_ret = 0; + *upperchild_ret = NULL; + return 0; + } + VOP_UNLOCK(current); + error = VFS_VGET(current->v_mount, found_ino, &next); + if (error) { + vrele(current); + return error; + } + KASSERT(VOP_ISLOCKED(next)); + if (next->v_type != VDIR) { + printf("ufs_parentcheck: inode %llu reached via .. of " + "inode %llu is not a directory\n", + (unsigned long long)VTOI(next)->i_number, + (unsigned long long)VTOI(current)->i_number); + vput(next); + vrele(current); + return ENOTDIR; + } + vrele(current); + current = next; + } + + return 0; +} + +#define UFS_DIRRABLKS 0 +int ufs_dirrablks = UFS_DIRRABLKS; + +/* + * ufs_blkatoff: Return buffer with the contents of block "offset" from + * the beginning of directory "vp". If "res" is non-zero, fill it in with + * a pointer to the remaining space in the directory. If the caller intends + * to modify the buffer returned, "modify" must be true. + */ + +int +ufs_blkatoff(struct vnode *vp, off_t offset, char **res, struct buf **bpp, + bool modify) +{ + struct inode *ip; + struct buf *bp; + daddr_t lbn; + const int dirrablks = ufs_dirrablks; + daddr_t *blks; + int *blksizes; + int run, error; + struct mount *mp = vp->v_mount; + const int bshift = mp->mnt_fs_bshift; + const int bsize = 1 << bshift; + off_t eof; + + blks = kmem_alloc((1 + dirrablks) * sizeof(daddr_t), KM_SLEEP); + blksizes = kmem_alloc((1 + dirrablks) * sizeof(int), KM_SLEEP); + ip = VTOI(vp); + KASSERT(vp->v_size == ip->i_size); + GOP_SIZE(vp, vp->v_size, &eof, 0); + lbn = offset >> bshift; + + for (run = 0; run <= dirrablks;) { + const off_t curoff = lbn << bshift; + const int size = MIN(eof - curoff, bsize); + + if (size == 0) { + break; + } + KASSERT(curoff < eof); + blks[run] = lbn; + blksizes[run] = size; + lbn++; + run++; + if (size != bsize) { + break; + } + } + KASSERT(run >= 1); + error = breadn(vp, blks[0], blksizes[0], &blks[1], &blksizes[1], + run - 1, NOCRED, (modify ? B_MODIFY : 0), &bp); + if (error != 0) { + brelse(bp, 0); + *bpp = NULL; + goto out; + } + if (res) { + *res = (char *)bp->b_data + (offset & (bsize - 1)); + } + *bpp = bp; + + out: + kmem_free(blks, (1 + dirrablks) * sizeof(daddr_t)); + kmem_free(blksizes, (1 + dirrablks) * sizeof(int)); + return error; +} diff --git a/sys/ufs/ufs/ufs_quota.c b/sys/ufs/ufs/ufs_quota.c new file mode 100644 index 000000000..78cef57e1 --- /dev/null +++ b/sys/ufs/ufs/ufs_quota.c @@ -0,0 +1,877 @@ +/* $NetBSD: ufs_quota.c,v 1.70 2011/03/24 17:05:46 bouyer Exp $ */ + +/* + * Copyright (c) 1982, 1986, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. 
+ * + * This code is derived from software contributed to Berkeley by + * Robert Elz at The University of Melbourne. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_quota.c 8.5 (Berkeley) 5/20/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ufs_quota.c,v 1.70 2011/03/24 17:05:46 bouyer Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_quota.h" +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +kmutex_t dqlock; +kcondvar_t dqcv; + +/* + * Code pertaining to management of the in-core dquot data structures. + */ +#define DQHASH(dqvp, id) \ + (((((long)(dqvp)) >> 8) + id) & dqhash) +static LIST_HEAD(dqhashhead, dquot) *dqhashtbl; +static u_long dqhash; +static pool_cache_t dquot_cache; + + +static int quota_handle_cmd_get_version(struct mount *, struct lwp *, + prop_dictionary_t, prop_array_t); +static int quota_handle_cmd_get(struct mount *, struct lwp *, + prop_dictionary_t, int, prop_array_t); +static int quota_handle_cmd_set(struct mount *, struct lwp *, + prop_dictionary_t, int, prop_array_t); +static int quota_handle_cmd_getall(struct mount *, struct lwp *, + prop_dictionary_t, int, prop_array_t); +static int quota_handle_cmd_clear(struct mount *, struct lwp *, + prop_dictionary_t, int, prop_array_t); +static int quota_handle_cmd_quotaon(struct mount *, struct lwp *, + prop_dictionary_t, int, prop_array_t); +static int quota_handle_cmd_quotaoff(struct mount *, struct lwp *, + prop_dictionary_t, int, prop_array_t); +/* + * Initialize the quota fields of an inode. + */ +void +ufsquota_init(struct inode *ip) +{ + int i; + + for (i = 0; i < MAXQUOTAS; i++) + ip->i_dquot[i] = NODQUOT; +} + +/* + * Release the quota fields from an inode. + */ +void +ufsquota_free(struct inode *ip) +{ + int i; + + for (i = 0; i < MAXQUOTAS; i++) { + dqrele(ITOV(ip), ip->i_dquot[i]); + ip->i_dquot[i] = NODQUOT; + } +} + +/* + * Update disk usage, and take corrective action. 
+ */ +int +chkdq(struct inode *ip, int64_t change, kauth_cred_t cred, int flags) +{ + /* do not track snapshot usage, or we will deadlock */ + if ((ip->i_flags & SF_SNAPSHOT) != 0) + return 0; + +#ifdef QUOTA + if (ip->i_ump->um_flags & UFS_QUOTA) + return chkdq1(ip, change, cred, flags); +#endif +#ifdef QUOTA2 + if (ip->i_ump->um_flags & UFS_QUOTA2) + return chkdq2(ip, change, cred, flags); +#endif + return 0; +} + +/* + * Check the inode limit, applying corrective action. + */ +int +chkiq(struct inode *ip, int32_t change, kauth_cred_t cred, int flags) +{ + /* do not track snapshot usage, or we will deadlock */ + if ((ip->i_flags & SF_SNAPSHOT) != 0) + return 0; +#ifdef QUOTA + if (ip->i_ump->um_flags & UFS_QUOTA) + return chkiq1(ip, change, cred, flags); +#endif +#ifdef QUOTA2 + if (ip->i_ump->um_flags & UFS_QUOTA2) + return chkiq2(ip, change, cred, flags); +#endif + return 0; +} + +int +quota_handle_cmd(struct mount *mp, struct lwp *l, prop_dictionary_t cmddict) +{ + int error = 0; + const char *cmd, *type; + prop_array_t datas; + int q2type; + + if (!prop_dictionary_get_cstring_nocopy(cmddict, "command", &cmd)) + return EINVAL; + if (!prop_dictionary_get_cstring_nocopy(cmddict, "type", &type)) + return EINVAL; + if (!strcmp(type, QUOTADICT_CLASS_USER)) { + q2type = USRQUOTA; + } else if (!strcmp(type, QUOTADICT_CLASS_GROUP)) { + q2type = GRPQUOTA; + } else + return EOPNOTSUPP; + datas = prop_dictionary_get(cmddict, "data"); + if (datas == NULL || prop_object_type(datas) != PROP_TYPE_ARRAY) + return EINVAL; + + prop_object_retain(datas); + prop_dictionary_remove(cmddict, "data"); /* prepare for return */ + + if (strcmp(cmd, "get version") == 0) { + error = quota_handle_cmd_get_version(mp, l, cmddict, datas); + goto end; + } + if (strcmp(cmd, "quotaon") == 0) { + error = quota_handle_cmd_quotaon(mp, l, cmddict, + q2type, datas); + goto end; + } + if (strcmp(cmd, "quotaoff") == 0) { + error = quota_handle_cmd_quotaoff(mp, l, cmddict, + q2type, datas); + goto end; + } + if (strcmp(cmd, "get") == 0) { + error = quota_handle_cmd_get(mp, l, cmddict, q2type, datas); + goto end; + } + if (strcmp(cmd, "set") == 0) { + error = quota_handle_cmd_set(mp, l, cmddict, q2type, datas); + goto end; + } + if (strcmp(cmd, "getall") == 0) { + error = quota_handle_cmd_getall(mp, l, cmddict, q2type, datas); + goto end; + } + if (strcmp(cmd, "clear") == 0) { + error = quota_handle_cmd_clear(mp, l, cmddict, q2type, datas); + goto end; + } + error = EOPNOTSUPP; +end: + error = (prop_dictionary_set_int8(cmddict, "return", + error) ? 
0 : ENOMEM); + prop_object_release(datas); + return error; +} + +static int +quota_handle_cmd_get_version(struct mount *mp, struct lwp *l, + prop_dictionary_t cmddict, prop_array_t datas) +{ + struct ufsmount *ump = VFSTOUFS(mp); + prop_array_t replies; + prop_dictionary_t data; + int error = 0; + + if ((ump->um_flags & (UFS_QUOTA|UFS_QUOTA2)) == 0) + return EOPNOTSUPP; + + replies = prop_array_create(); + if (replies == NULL) + return ENOMEM; + + data = prop_dictionary_create(); + if (data == NULL) { + prop_object_release(replies); + return ENOMEM; + } + +#ifdef QUOTA + if (ump->um_flags & UFS_QUOTA) { + if (!prop_dictionary_set_int8(data, "version", 1)) + error = ENOMEM; + } else +#endif +#ifdef QUOTA2 + if (ump->um_flags & UFS_QUOTA2) { + if (!prop_dictionary_set_int8(data, "version", 2)) + error = ENOMEM; + } else +#endif + error = 0; + if (error) + prop_object_release(data); + else if (!prop_array_add_and_rel(replies, data)) + error = ENOMEM; + if (error) + prop_object_release(replies); + else if (!prop_dictionary_set_and_rel(cmddict, "data", replies)) + error = ENOMEM; + return error; +} + +/* XXX shouldn't all this be in kauth ? */ +static int +quota_get_auth(struct mount *mp, struct lwp *l, uid_t id) { + /* The user can always query about his own quota. */ + if (id == kauth_cred_getuid(l->l_cred)) + return 0; + return kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, + KAUTH_REQ_SYSTEM_FS_QUOTA_GET, mp, KAUTH_ARG(id), NULL); +} + +static int +quota_handle_cmd_get(struct mount *mp, struct lwp *l, + prop_dictionary_t cmddict, int type, prop_array_t datas) +{ + prop_array_t replies; + prop_object_iterator_t iter; + prop_dictionary_t data; + uint32_t id; + struct ufsmount *ump = VFSTOUFS(mp); + int error, defaultq = 0; + const char *idstr; + + if ((ump->um_flags & (UFS_QUOTA|UFS_QUOTA2)) == 0) + return EOPNOTSUPP; + + replies = prop_array_create(); + if (replies == NULL) + return ENOMEM; + + iter = prop_array_iterator(datas); + if (iter == NULL) { + prop_object_release(replies); + return ENOMEM; + } + while ((data = prop_object_iterator_next(iter)) != NULL) { + if (!prop_dictionary_get_uint32(data, "id", &id)) { + if (!prop_dictionary_get_cstring_nocopy(data, "id", + &idstr)) + continue; + if (strcmp(idstr, "default")) { + error = EINVAL; + goto err; + } + id = 0; + defaultq = 1; + } else { + defaultq = 0; + } + error = quota_get_auth(mp, l, id); + if (error == EPERM) + continue; + if (error != 0) + goto err; +#ifdef QUOTA + if (ump->um_flags & UFS_QUOTA) + error = quota1_handle_cmd_get(ump, type, id, defaultq, + replies); + else +#endif +#ifdef QUOTA2 + if (ump->um_flags & UFS_QUOTA2) { + error = quota2_handle_cmd_get(ump, type, id, defaultq, + replies); + } else +#endif + panic("quota_handle_cmd_get: no support ?"); + + if (error == ENOENT) + continue; + if (error != 0) + goto err; + } + prop_object_iterator_release(iter); + if (!prop_dictionary_set_and_rel(cmddict, "data", replies)) { + error = ENOMEM; + } else { + error = 0; + } + return error; +err: + prop_object_iterator_release(iter); + prop_object_release(replies); + return error; +} + +static int +quota_handle_cmd_set(struct mount *mp, struct lwp *l, + prop_dictionary_t cmddict, int type, prop_array_t datas) +{ + prop_array_t replies; + prop_object_iterator_t iter; + prop_dictionary_t data; + uint32_t id; + struct ufsmount *ump = VFSTOUFS(mp); + int error, defaultq = 0; + const char *idstr; + + if ((ump->um_flags & (UFS_QUOTA|UFS_QUOTA2)) == 0) + return EOPNOTSUPP; + + replies = prop_array_create(); + if (replies == 
NULL) + return ENOMEM; + + iter = prop_array_iterator(datas); + if (iter == NULL) { + prop_object_release(replies); + return ENOMEM; + } + while ((data = prop_object_iterator_next(iter)) != NULL) { + if (!prop_dictionary_get_uint32(data, "id", &id)) { + if (!prop_dictionary_get_cstring_nocopy(data, "id", + &idstr)) + continue; + if (strcmp(idstr, "default")) + continue; + id = 0; + defaultq = 1; + } else { + defaultq = 0; + } + error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, + KAUTH_REQ_SYSTEM_FS_QUOTA_MANAGE, mp, KAUTH_ARG(id), NULL); + if (error != 0) + goto err; +#ifdef QUOTA + if (ump->um_flags & UFS_QUOTA) + error = quota1_handle_cmd_set(ump, type, id, defaultq, + data); + else +#endif +#ifdef QUOTA2 + if (ump->um_flags & UFS_QUOTA2) { + error = quota2_handle_cmd_set(ump, type, id, defaultq, + data); + } else +#endif + panic("quota_handle_cmd_get: no support ?"); + + if (error && error != ENOENT) + goto err; + } + prop_object_iterator_release(iter); + if (!prop_dictionary_set_and_rel(cmddict, "data", replies)) { + error = ENOMEM; + } else { + error = 0; + } + return error; +err: + prop_object_iterator_release(iter); + prop_object_release(replies); + return error; +} + +static int +quota_handle_cmd_clear(struct mount *mp, struct lwp *l, + prop_dictionary_t cmddict, int type, prop_array_t datas) +{ + prop_array_t replies; + prop_object_iterator_t iter; + prop_dictionary_t data; + uint32_t id; + struct ufsmount *ump = VFSTOUFS(mp); + int error, defaultq = 0; + const char *idstr; + + if ((ump->um_flags & UFS_QUOTA2) == 0) + return EOPNOTSUPP; + + replies = prop_array_create(); + if (replies == NULL) + return ENOMEM; + + iter = prop_array_iterator(datas); + if (iter == NULL) { + prop_object_release(replies); + return ENOMEM; + } + while ((data = prop_object_iterator_next(iter)) != NULL) { + if (!prop_dictionary_get_uint32(data, "id", &id)) { + if (!prop_dictionary_get_cstring_nocopy(data, "id", + &idstr)) + continue; + if (strcmp(idstr, "default")) + continue; + id = 0; + defaultq = 1; + } else { + defaultq = 0; + } + error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, + KAUTH_REQ_SYSTEM_FS_QUOTA_MANAGE, mp, KAUTH_ARG(id), NULL); + if (error != 0) + goto err; +#ifdef QUOTA2 + if (ump->um_flags & UFS_QUOTA2) { + error = quota2_handle_cmd_clear(ump, type, id, defaultq, + data); + } else +#endif + panic("quota_handle_cmd_get: no support ?"); + + if (error && error != ENOENT) + goto err; + } + prop_object_iterator_release(iter); + if (!prop_dictionary_set_and_rel(cmddict, "data", replies)) { + error = ENOMEM; + } else { + error = 0; + } + return error; +err: + prop_object_iterator_release(iter); + prop_object_release(replies); + return error; +} + +static int +quota_handle_cmd_getall(struct mount *mp, struct lwp *l, + prop_dictionary_t cmddict, int type, prop_array_t datas) +{ + prop_array_t replies; + struct ufsmount *ump = VFSTOUFS(mp); + int error; + + if ((ump->um_flags & UFS_QUOTA2) == 0) + return EOPNOTSUPP; + + error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, + KAUTH_REQ_SYSTEM_FS_QUOTA_GET, mp, NULL, NULL); + if (error) + return error; + + replies = prop_array_create(); + if (replies == NULL) + return ENOMEM; + +#ifdef QUOTA2 + if (ump->um_flags & UFS_QUOTA2) { + error = quota2_handle_cmd_getall(ump, type, replies); + } else +#endif + panic("quota_handle_cmd_getall: no support ?"); + if (!prop_dictionary_set_and_rel(cmddict, "data", replies)) { + error = ENOMEM; + } else { + error = 0; + } + return error; +} + +static int 
+quota_handle_cmd_quotaon(struct mount *mp, struct lwp *l, + prop_dictionary_t cmddict, int type, prop_array_t datas) +{ + prop_dictionary_t data; + struct ufsmount *ump = VFSTOUFS(mp); + int error; + const char *qfile; + + if ((ump->um_flags & UFS_QUOTA2) != 0) + return EBUSY; + + if (prop_array_count(datas) != 1) + return EINVAL; + + data = prop_array_get(datas, 0); + if (data == NULL) + return ENOMEM; + if (!prop_dictionary_get_cstring_nocopy(data, "quotafile", + &qfile)) + return EINVAL; + + error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, + KAUTH_REQ_SYSTEM_FS_QUOTA_ONOFF, mp, NULL, NULL); + if (error != 0) { + return error; + } +#ifdef QUOTA + error = quota1_handle_cmd_quotaon(l, ump, type, qfile); +#else + error = EOPNOTSUPP; +#endif + + return error; +} + +static int +quota_handle_cmd_quotaoff(struct mount *mp, struct lwp *l, + prop_dictionary_t cmddict, int type, prop_array_t datas) +{ + struct ufsmount *ump = VFSTOUFS(mp); + int error; + + if ((ump->um_flags & UFS_QUOTA2) != 0) + return EOPNOTSUPP; + + if (prop_array_count(datas) != 0) + return EINVAL; + + error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, + KAUTH_REQ_SYSTEM_FS_QUOTA_ONOFF, mp, NULL, NULL); + if (error != 0) { + return error; + } +#ifdef QUOTA + error = quota1_handle_cmd_quotaoff(l, ump, type); +#else + error = EOPNOTSUPP; +#endif + + return error; +} + +/* + * Initialize the quota system. + */ +void +dqinit(void) +{ + + mutex_init(&dqlock, MUTEX_DEFAULT, IPL_NONE); + cv_init(&dqcv, "quota"); + dqhashtbl = hashinit(desiredvnodes, HASH_LIST, true, &dqhash); + dquot_cache = pool_cache_init(sizeof(struct dquot), 0, 0, 0, "ufsdq", + NULL, IPL_NONE, NULL, NULL, NULL); +} + +void +dqreinit(void) +{ + struct dquot *dq; + struct dqhashhead *oldhash, *hash; + struct vnode *dqvp; + u_long oldmask, mask, hashval; + int i; + + hash = hashinit(desiredvnodes, HASH_LIST, true, &mask); + mutex_enter(&dqlock); + oldhash = dqhashtbl; + oldmask = dqhash; + dqhashtbl = hash; + dqhash = mask; + for (i = 0; i <= oldmask; i++) { + while ((dq = LIST_FIRST(&oldhash[i])) != NULL) { + dqvp = dq->dq_ump->um_quotas[dq->dq_type]; + LIST_REMOVE(dq, dq_hash); + hashval = DQHASH(dqvp, dq->dq_id); + LIST_INSERT_HEAD(&dqhashtbl[hashval], dq, dq_hash); + } + } + mutex_exit(&dqlock); + hashdone(oldhash, HASH_LIST, oldmask); +} + +/* + * Free resources held by quota system. + */ +void +dqdone(void) +{ + + pool_cache_destroy(dquot_cache); + hashdone(dqhashtbl, HASH_LIST, dqhash); + cv_destroy(&dqcv); + mutex_destroy(&dqlock); +} + +/* + * Set up the quotas for an inode. + * + * This routine completely defines the semantics of quotas. + * If other criterion want to be used to establish quotas, the + * MAXQUOTAS value in quotas.h should be increased, and the + * additional dquots set up here. + */ +int +getinoquota(struct inode *ip) +{ + struct ufsmount *ump = ip->i_ump; + struct vnode *vp = ITOV(ip); + int i, error; + u_int32_t ino_ids[MAXQUOTAS]; + + /* + * To avoid deadlocks never update quotas for quota files + * on the same file system + */ + for (i = 0; i < MAXQUOTAS; i++) + if (vp == ump->um_quotas[i]) + return 0; + + ino_ids[USRQUOTA] = ip->i_uid; + ino_ids[GRPQUOTA] = ip->i_gid; + for (i = 0; i < MAXQUOTAS; i++) { + /* + * If the file id changed the quota needs update. + */ + if (ip->i_dquot[i] != NODQUOT && + ip->i_dquot[i]->dq_id != ino_ids[i]) { + dqrele(ITOV(ip), ip->i_dquot[i]); + ip->i_dquot[i] = NODQUOT; + } + /* + * Set up the quota based on file id. + * ENODEV means that quotas are not enabled. 
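/*
 * Illustrative sketch (not taken from the patch): dqreinit() above resizes
 * the dquot hash by unlinking every entry from the old chains and pushing it
 * onto a bucket of the freshly sized table.  The same rehash pattern in a
 * minimal, single-threaded form; toy_dq and the masks are inventions here.
 */
#include <stdio.h>

struct toy_dq {
    unsigned id;
    struct toy_dq *next;
};

static void rehash(struct toy_dq **oldtbl, unsigned oldmask,
                   struct toy_dq **newtbl, unsigned newmask)
{
    for (unsigned i = 0; i <= oldmask; i++) {
        struct toy_dq *dq;

        while ((dq = oldtbl[i]) != NULL) {
            oldtbl[i] = dq->next;            /* unlink from the old chain */
            unsigned h = dq->id & newmask;   /* recompute the bucket */
            dq->next = newtbl[h];            /* push onto the new chain */
            newtbl[h] = dq;
        }
    }
}

int main(void)
{
    struct toy_dq a = { 5, NULL }, b = { 13, &a };
    struct toy_dq *oldtbl[4] = { NULL }, *newtbl[16] = { NULL };

    oldtbl[5 & 3] = &b;                      /* ids 5 and 13 share bucket 1 */
    rehash(oldtbl, 3, newtbl, 15);
    printf("bucket 5 -> %u, bucket 13 -> %u\n", newtbl[5]->id, newtbl[13]->id);
    return 0;
}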
+ */ + if (ip->i_dquot[i] == NODQUOT && + (error = dqget(vp, ino_ids[i], ump, i, &ip->i_dquot[i])) && + error != ENODEV) + return (error); + } + return 0; +} + +/* + * Obtain a dquot structure for the specified identifier and quota file + * reading the information from the file if necessary. + */ +int +dqget(struct vnode *vp, u_long id, struct ufsmount *ump, int type, + struct dquot **dqp) +{ + struct dquot *dq, *ndq; + struct dqhashhead *dqh; + struct vnode *dqvp; + int error = 0; /* XXX gcc */ + + /* Lock to see an up to date value for QTF_CLOSING. */ + mutex_enter(&dqlock); + if ((ump->um_flags & (UFS_QUOTA|UFS_QUOTA2)) == 0) { + mutex_exit(&dqlock); + *dqp = NODQUOT; + return (ENODEV); + } + dqvp = ump->um_quotas[type]; +#ifdef QUOTA + if (ump->um_flags & UFS_QUOTA) { + if (dqvp == NULLVP || (ump->umq1_qflags[type] & QTF_CLOSING)) { + mutex_exit(&dqlock); + *dqp = NODQUOT; + return (ENODEV); + } + } +#endif +#ifdef QUOTA2 + if (ump->um_flags & UFS_QUOTA2) { + if (dqvp == NULLVP) { + mutex_exit(&dqlock); + *dqp = NODQUOT; + return (ENODEV); + } + } +#endif + KASSERT(dqvp != vp); + /* + * Check the cache first. + */ + dqh = &dqhashtbl[DQHASH(dqvp, id)]; + LIST_FOREACH(dq, dqh, dq_hash) { + if (dq->dq_id != id || + dq->dq_ump->um_quotas[dq->dq_type] != dqvp) + continue; + KASSERT(dq->dq_cnt > 0); + dqref(dq); + mutex_exit(&dqlock); + *dqp = dq; + return (0); + } + /* + * Not in cache, allocate a new one. + */ + mutex_exit(&dqlock); + ndq = pool_cache_get(dquot_cache, PR_WAITOK); + /* + * Initialize the contents of the dquot structure. + */ + memset((char *)ndq, 0, sizeof *ndq); + ndq->dq_flags = 0; + ndq->dq_id = id; + ndq->dq_ump = ump; + ndq->dq_type = type; + mutex_init(&ndq->dq_interlock, MUTEX_DEFAULT, IPL_NONE); + mutex_enter(&dqlock); + dqh = &dqhashtbl[DQHASH(dqvp, id)]; + LIST_FOREACH(dq, dqh, dq_hash) { + if (dq->dq_id != id || + dq->dq_ump->um_quotas[dq->dq_type] != dqvp) + continue; + /* + * Another thread beat us allocating this dquot. + */ + KASSERT(dq->dq_cnt > 0); + dqref(dq); + mutex_exit(&dqlock); + mutex_destroy(&ndq->dq_interlock); + pool_cache_put(dquot_cache, ndq); + *dqp = dq; + return 0; + } + dq = ndq; + LIST_INSERT_HEAD(dqh, dq, dq_hash); + dqref(dq); + mutex_enter(&dq->dq_interlock); + mutex_exit(&dqlock); +#ifdef QUOTA + if (ump->um_flags & UFS_QUOTA) + error = dq1get(dqvp, id, ump, type, dq); +#endif +#ifdef QUOTA2 + if (ump->um_flags & UFS_QUOTA2) + error = dq2get(dqvp, id, ump, type, dq); +#endif + /* + * I/O error in reading quota file, release + * quota structure and reflect problem to caller. + */ + if (error) { + mutex_enter(&dqlock); + LIST_REMOVE(dq, dq_hash); + mutex_exit(&dqlock); + mutex_exit(&dq->dq_interlock); + dqrele(vp, dq); + *dqp = NODQUOT; + return (error); + } + mutex_exit(&dq->dq_interlock); + *dqp = dq; + return (0); +} + +/* + * Obtain a reference to a dquot. + */ +void +dqref(struct dquot *dq) +{ + + KASSERT(mutex_owned(&dqlock)); + dq->dq_cnt++; + KASSERT(dq->dq_cnt > 0); +} + +/* + * Release a reference to a dquot. 
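/*
 * Illustrative sketch (not taken from the patch): dqget() above looks the id
 * up in the hash, drops dqlock to allocate a fresh dquot, then re-checks the
 * hash under the lock because another thread may have inserted the same id
 * in the meantime; the loser throws its allocation away.  A hedged pthread
 * restatement of that pattern with invented types, not the kernel API.
 */
#include <pthread.h>
#include <stdlib.h>

struct entry {
    unsigned id;
    struct entry *next;
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry *table_head;

static struct entry *lookup_locked(unsigned id)
{
    for (struct entry *e = table_head; e != NULL; e = e->next)
        if (e->id == id)
            return e;
    return NULL;
}

struct entry *get_entry(unsigned id)
{
    struct entry *e, *fresh;

    pthread_mutex_lock(&table_lock);
    e = lookup_locked(id);
    pthread_mutex_unlock(&table_lock);
    if (e != NULL)
        return e;

    fresh = calloc(1, sizeof(*fresh));       /* allocate with no lock held */
    if (fresh == NULL)
        return NULL;
    fresh->id = id;

    pthread_mutex_lock(&table_lock);
    e = lookup_locked(id);                   /* re-check: did someone race us? */
    if (e == NULL) {
        fresh->next = table_head;            /* we won: publish our entry */
        table_head = fresh;
        e = fresh;
        fresh = NULL;
    }
    pthread_mutex_unlock(&table_lock);
    free(fresh);                             /* no-op when we won the race */
    return e;
}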
+ */ +void +dqrele(struct vnode *vp, struct dquot *dq) +{ + + if (dq == NODQUOT) + return; + mutex_enter(&dq->dq_interlock); + for (;;) { + mutex_enter(&dqlock); + if (dq->dq_cnt > 1) { + dq->dq_cnt--; + mutex_exit(&dqlock); + mutex_exit(&dq->dq_interlock); + return; + } + if ((dq->dq_flags & DQ_MOD) == 0) + break; + mutex_exit(&dqlock); +#ifdef QUOTA + if (dq->dq_ump->um_flags & UFS_QUOTA) + (void) dq1sync(vp, dq); +#endif +#ifdef QUOTA2 + if (dq->dq_ump->um_flags & UFS_QUOTA2) + (void) dq2sync(vp, dq); +#endif + } + KASSERT(dq->dq_cnt == 1 && (dq->dq_flags & DQ_MOD) == 0); + LIST_REMOVE(dq, dq_hash); + mutex_exit(&dqlock); + mutex_exit(&dq->dq_interlock); + mutex_destroy(&dq->dq_interlock); + pool_cache_put(dquot_cache, dq); +} + +int +qsync(struct mount *mp) +{ + struct ufsmount *ump = VFSTOUFS(mp); +#ifdef QUOTA + if (ump->um_flags & UFS_QUOTA) + return q1sync(mp); +#endif +#ifdef QUOTA2 + if (ump->um_flags & UFS_QUOTA2) + return q2sync(mp); +#endif + return 0; +} + +#ifdef DIAGNOSTIC +/* + * Check the hash chains for stray dquot's. + */ +void +dqflush(struct vnode *vp) +{ + struct dquot *dq; + int i; + + mutex_enter(&dqlock); + for (i = 0; i <= dqhash; i++) + LIST_FOREACH(dq, &dqhashtbl[i], dq_hash) + KASSERT(dq->dq_ump->um_quotas[dq->dq_type] != vp); + mutex_exit(&dqlock); +} +#endif diff --git a/include/ufs/ufs/ufs_quota.h b/sys/ufs/ufs/ufs_quota.h similarity index 100% rename from include/ufs/ufs/ufs_quota.h rename to sys/ufs/ufs/ufs_quota.h diff --git a/sys/ufs/ufs/ufs_quota1.c b/sys/ufs/ufs/ufs_quota1.c new file mode 100644 index 000000000..4fdb57c95 --- /dev/null +++ b/sys/ufs/ufs/ufs_quota1.c @@ -0,0 +1,885 @@ +/* $NetBSD: ufs_quota1.c,v 1.6 2011/11/25 16:55:05 dholland Exp $ */ + +/* + * Copyright (c) 1982, 1986, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Robert Elz at The University of Melbourne. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ufs_quota.c 8.5 (Berkeley) 5/20/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ufs_quota1.c,v 1.6 2011/11/25 16:55:05 dholland Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static int chkdqchg(struct inode *, int64_t, kauth_cred_t, int); +static int chkiqchg(struct inode *, int32_t, kauth_cred_t, int); + +/* + * Update disk usage, and take corrective action. + */ +int +chkdq1(struct inode *ip, int64_t change, kauth_cred_t cred, int flags) +{ + struct dquot *dq; + int i; + int ncurblocks, error; + + if ((error = getinoquota(ip)) != 0) + return error; + if (change == 0) + return (0); + if (change < 0) { + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + mutex_enter(&dq->dq_interlock); + ncurblocks = dq->dq_curblocks + change; + if (ncurblocks >= 0) + dq->dq_curblocks = ncurblocks; + else + dq->dq_curblocks = 0; + dq->dq_flags &= ~DQ_WARN(QL_BLOCK); + dq->dq_flags |= DQ_MOD; + mutex_exit(&dq->dq_interlock); + } + return (0); + } + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + if ((flags & FORCE) == 0 && + kauth_authorize_system(cred, KAUTH_SYSTEM_FS_QUOTA, + KAUTH_REQ_SYSTEM_FS_QUOTA_NOLIMIT, KAUTH_ARG(i), + KAUTH_ARG(QL_BLOCK), NULL) != 0) { + mutex_enter(&dq->dq_interlock); + error = chkdqchg(ip, change, cred, i); + mutex_exit(&dq->dq_interlock); + if (error != 0) + return (error); + } + } + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + mutex_enter(&dq->dq_interlock); + dq->dq_curblocks += change; + dq->dq_flags |= DQ_MOD; + mutex_exit(&dq->dq_interlock); + } + return (0); +} + +/* + * Check for a valid change to a users allocation. + * Issue an error message if appropriate. + */ +static int +chkdqchg(struct inode *ip, int64_t change, kauth_cred_t cred, int type) +{ + struct dquot *dq = ip->i_dquot[type]; + long ncurblocks = dq->dq_curblocks + change; + + KASSERT(mutex_owned(&dq->dq_interlock)); + /* + * If user would exceed their hard limit, disallow space allocation. + */ + if (ncurblocks >= dq->dq_bhardlimit && dq->dq_bhardlimit) { + if ((dq->dq_flags & DQ_WARN(QL_BLOCK)) == 0 && + ip->i_uid == kauth_cred_geteuid(cred)) { + uprintf("\n%s: write failed, %s disk limit reached\n", + ITOV(ip)->v_mount->mnt_stat.f_mntonname, + quotatypes[type]); + dq->dq_flags |= DQ_WARN(QL_BLOCK); + } + return (EDQUOT); + } + /* + * If user is over their soft limit for too long, disallow space + * allocation. Reset time limit as they cross their soft limit. + */ + if (ncurblocks >= dq->dq_bsoftlimit && dq->dq_bsoftlimit) { + if (dq->dq_curblocks < dq->dq_bsoftlimit) { + dq->dq_btime = + time_second + ip->i_ump->umq1_btime[type]; + if (ip->i_uid == kauth_cred_geteuid(cred)) + uprintf("\n%s: warning, %s %s\n", + ITOV(ip)->v_mount->mnt_stat.f_mntonname, + quotatypes[type], "disk quota exceeded"); + return (0); + } + if (time_second > dq->dq_btime) { + if ((dq->dq_flags & DQ_WARN(QL_BLOCK)) == 0 && + ip->i_uid == kauth_cred_geteuid(cred)) { + uprintf("\n%s: write failed, %s %s\n", + ITOV(ip)->v_mount->mnt_stat.f_mntonname, + quotatypes[type], + "disk quota exceeded for too long"); + dq->dq_flags |= DQ_WARN(QL_BLOCK); + } + return (EDQUOT); + } + } + return (0); +} + +/* + * Check the inode limit, applying corrective action. 
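/*
 * Illustrative sketch (not taken from the patch): the hard/soft/grace logic
 * in chkdqchg() above (mirrored for inodes by chkiqchg() below) reduces to
 * three cases.  A stand-alone model with invented field names; EDQUOT is
 * replaced by a plain -1 and usage accounting is left to the caller, as in
 * the kernel code.
 */
#include <stdio.h>
#include <time.h>

struct toy_quota {
    long cur, soft, hard;    /* current usage and limits; 0 means "no limit" */
    time_t grace_ends;       /* armed when the soft limit is first crossed */
    long grace_secs;
};

/* Return 0 if "want" more units may be allocated, -1 if over quota. */
static int toy_chkdqchg(struct toy_quota *q, long want, time_t now)
{
    long next = q->cur + want;

    if (q->hard && next >= q->hard)
        return -1;                           /* hard limit: always refuse */
    if (q->soft && next >= q->soft) {
        if (q->cur < q->soft) {
            q->grace_ends = now + q->grace_secs; /* just crossed: start grace */
            return 0;
        }
        if (now > q->grace_ends)
            return -1;                       /* over the soft limit too long */
    }
    return 0;
}

int main(void)
{
    struct toy_quota q = { 90, 100, 200, 0, 7 * 24 * 3600 };
    time_t now = time(NULL);

    printf("%d\n", toy_chkdqchg(&q, 20, now));    /* 0: grace period starts */
    printf("%d\n", toy_chkdqchg(&q, 200, now));   /* -1: would hit hard limit */
    return 0;
}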
+ */ +int +chkiq1(struct inode *ip, int32_t change, kauth_cred_t cred, int flags) +{ + struct dquot *dq; + int i; + int ncurinodes, error; + + if ((error = getinoquota(ip)) != 0) + return error; + if (change == 0) + return (0); + if (change < 0) { + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + mutex_enter(&dq->dq_interlock); + ncurinodes = dq->dq_curinodes + change; + if (ncurinodes >= 0) + dq->dq_curinodes = ncurinodes; + else + dq->dq_curinodes = 0; + dq->dq_flags &= ~DQ_WARN(QL_FILE); + dq->dq_flags |= DQ_MOD; + mutex_exit(&dq->dq_interlock); + } + return (0); + } + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + if ((flags & FORCE) == 0 && kauth_authorize_system(cred, + KAUTH_SYSTEM_FS_QUOTA, KAUTH_REQ_SYSTEM_FS_QUOTA_NOLIMIT, + KAUTH_ARG(i), KAUTH_ARG(QL_FILE), NULL) != 0) { + mutex_enter(&dq->dq_interlock); + error = chkiqchg(ip, change, cred, i); + mutex_exit(&dq->dq_interlock); + if (error != 0) + return (error); + } + } + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + mutex_enter(&dq->dq_interlock); + dq->dq_curinodes += change; + dq->dq_flags |= DQ_MOD; + mutex_exit(&dq->dq_interlock); + } + return (0); +} + +/* + * Check for a valid change to a users allocation. + * Issue an error message if appropriate. + */ +static int +chkiqchg(struct inode *ip, int32_t change, kauth_cred_t cred, int type) +{ + struct dquot *dq = ip->i_dquot[type]; + long ncurinodes = dq->dq_curinodes + change; + + KASSERT(mutex_owned(&dq->dq_interlock)); + /* + * If user would exceed their hard limit, disallow inode allocation. + */ + if (ncurinodes >= dq->dq_ihardlimit && dq->dq_ihardlimit) { + if ((dq->dq_flags & DQ_WARN(QL_FILE)) == 0 && + ip->i_uid == kauth_cred_geteuid(cred)) { + uprintf("\n%s: write failed, %s inode limit reached\n", + ITOV(ip)->v_mount->mnt_stat.f_mntonname, + quotatypes[type]); + dq->dq_flags |= DQ_WARN(QL_FILE); + } + return (EDQUOT); + } + /* + * If user is over their soft limit for too long, disallow inode + * allocation. Reset time limit as they cross their soft limit. + */ + if (ncurinodes >= dq->dq_isoftlimit && dq->dq_isoftlimit) { + if (dq->dq_curinodes < dq->dq_isoftlimit) { + dq->dq_itime = + time_second + ip->i_ump->umq1_itime[type]; + if (ip->i_uid == kauth_cred_geteuid(cred)) + uprintf("\n%s: warning, %s %s\n", + ITOV(ip)->v_mount->mnt_stat.f_mntonname, + quotatypes[type], "inode quota exceeded"); + return (0); + } + if (time_second > dq->dq_itime) { + if ((dq->dq_flags & DQ_WARN(QL_FILE)) == 0 && + ip->i_uid == kauth_cred_geteuid(cred)) { + uprintf("\n%s: write failed, %s %s\n", + ITOV(ip)->v_mount->mnt_stat.f_mntonname, + quotatypes[type], + "inode quota exceeded for too long"); + dq->dq_flags |= DQ_WARN(QL_FILE); + } + return (EDQUOT); + } + } + return (0); +} + +int +quota1_umount(struct mount *mp, int flags) +{ + int i, error; + struct ufsmount *ump = VFSTOUFS(mp); + struct lwp *l = curlwp; + + if ((ump->um_flags & UFS_QUOTA) == 0) + return 0; + + if ((error = vflush(mp, NULLVP, SKIPSYSTEM | flags)) != 0) + return (error); + + for (i = 0; i < MAXQUOTAS; i++) { + if (ump->um_quotas[i] != NULLVP) { + quota1_handle_cmd_quotaoff(l, ump, i); + } + } + return 0; +} + +/* + * Code to process quotactl commands. + */ + +/* + * set up a quota file for a particular file system. 
+ */ +int +quota1_handle_cmd_quotaon(struct lwp *l, struct ufsmount *ump, int type, + const char *fname) +{ + struct mount *mp = ump->um_mountp; + struct vnode *vp, **vpp, *mvp; + struct dquot *dq; + int error; + struct pathbuf *pb; + struct nameidata nd; + + if (ump->um_flags & UFS_QUOTA2) { + uprintf("%s: quotas v2 already enabled\n", + mp->mnt_stat.f_mntonname); + return (EBUSY); + } + + if (mp->mnt_wapbl != NULL) { + printf("%s: quota v1 cannot be used with -o log\n", + mp->mnt_stat.f_mntonname); + return (EOPNOTSUPP); + } + + vpp = &ump->um_quotas[type]; + + pb = pathbuf_create(fname); + if (pb == NULL) { + return ENOMEM; + } + NDINIT(&nd, LOOKUP, FOLLOW, pb); + if ((error = vn_open(&nd, FREAD|FWRITE, 0)) != 0) { + pathbuf_destroy(pb); + return error; + } + vp = nd.ni_vp; + pathbuf_destroy(pb); + + VOP_UNLOCK(vp); + if (vp->v_type != VREG) { + (void) vn_close(vp, FREAD|FWRITE, l->l_cred); + return (EACCES); + } + if (*vpp != vp) + quota1_handle_cmd_quotaoff(l, ump, type); + mutex_enter(&dqlock); + while ((ump->umq1_qflags[type] & (QTF_CLOSING | QTF_OPENING)) != 0) + cv_wait(&dqcv, &dqlock); + ump->umq1_qflags[type] |= QTF_OPENING; + mutex_exit(&dqlock); + mp->mnt_flag |= MNT_QUOTA; + vp->v_vflag |= VV_SYSTEM; /* XXXSMP */ + *vpp = vp; + /* + * Save the credential of the process that turned on quotas. + * Set up the time limits for this quota. + */ + kauth_cred_hold(l->l_cred); + ump->um_cred[type] = l->l_cred; + ump->umq1_btime[type] = MAX_DQ_TIME; + ump->umq1_itime[type] = MAX_IQ_TIME; + if (dqget(NULLVP, 0, ump, type, &dq) == 0) { + if (dq->dq_btime > 0) + ump->umq1_btime[type] = dq->dq_btime; + if (dq->dq_itime > 0) + ump->umq1_itime[type] = dq->dq_itime; + dqrele(NULLVP, dq); + } + /* Allocate a marker vnode. */ + mvp = vnalloc(mp); + /* + * Search vnodes associated with this mount point, + * adding references to quota file being opened. + * NB: only need to add dquot's for inodes being modified. + */ + mutex_enter(&mntvnode_lock); +again: + for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) { + vmark(mvp, vp); + mutex_enter(vp->v_interlock); + if (VTOI(vp) == NULL || vp->v_mount != mp || vismarker(vp) || + vp->v_type == VNON || vp->v_writecount == 0 || + (vp->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0) { + mutex_exit(vp->v_interlock); + continue; + } + mutex_exit(&mntvnode_lock); + if (vget(vp, LK_EXCLUSIVE)) { + mutex_enter(&mntvnode_lock); + (void)vunmark(mvp); + goto again; + } + if ((error = getinoquota(VTOI(vp))) != 0) { + vput(vp); + mutex_enter(&mntvnode_lock); + (void)vunmark(mvp); + break; + } + vput(vp); + mutex_enter(&mntvnode_lock); + } + mutex_exit(&mntvnode_lock); + vnfree(mvp); + + mutex_enter(&dqlock); + ump->umq1_qflags[type] &= ~QTF_OPENING; + cv_broadcast(&dqcv); + if (error == 0) + ump->um_flags |= UFS_QUOTA; + mutex_exit(&dqlock); + if (error) + quota1_handle_cmd_quotaoff(l, ump, type); + return (error); +} + +/* + * turn off disk quotas for a filesystem. + */ +int +quota1_handle_cmd_quotaoff(struct lwp *l, struct ufsmount *ump, int type) +{ + struct mount *mp = ump->um_mountp; + struct vnode *vp; + struct vnode *qvp, *mvp; + struct dquot *dq; + struct inode *ip; + kauth_cred_t cred; + int i, error; + + /* Allocate a marker vnode. 
*/ + mvp = vnalloc(mp); + + mutex_enter(&dqlock); + while ((ump->umq1_qflags[type] & (QTF_CLOSING | QTF_OPENING)) != 0) + cv_wait(&dqcv, &dqlock); + if ((qvp = ump->um_quotas[type]) == NULLVP) { + mutex_exit(&dqlock); + vnfree(mvp); + return (0); + } + ump->umq1_qflags[type] |= QTF_CLOSING; + ump->um_flags &= ~UFS_QUOTA; + mutex_exit(&dqlock); + /* + * Search vnodes associated with this mount point, + * deleting any references to quota file being closed. + */ + mutex_enter(&mntvnode_lock); +again: + for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) { + vmark(mvp, vp); + mutex_enter(vp->v_interlock); + if (VTOI(vp) == NULL || vp->v_mount != mp || vismarker(vp) || + vp->v_type == VNON || + (vp->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0) { + mutex_exit(vp->v_interlock); + continue; + } + mutex_exit(&mntvnode_lock); + if (vget(vp, LK_EXCLUSIVE)) { + mutex_enter(&mntvnode_lock); + (void)vunmark(mvp); + goto again; + } + ip = VTOI(vp); + dq = ip->i_dquot[type]; + ip->i_dquot[type] = NODQUOT; + dqrele(vp, dq); + vput(vp); + mutex_enter(&mntvnode_lock); + } + mutex_exit(&mntvnode_lock); +#ifdef DIAGNOSTIC + dqflush(qvp); +#endif + qvp->v_vflag &= ~VV_SYSTEM; + error = vn_close(qvp, FREAD|FWRITE, l->l_cred); + mutex_enter(&dqlock); + ump->um_quotas[type] = NULLVP; + cred = ump->um_cred[type]; + ump->um_cred[type] = NOCRED; + for (i = 0; i < MAXQUOTAS; i++) + if (ump->um_quotas[i] != NULLVP) + break; + ump->umq1_qflags[type] &= ~QTF_CLOSING; + cv_broadcast(&dqcv); + mutex_exit(&dqlock); + kauth_cred_free(cred); + if (i == MAXQUOTAS) + mp->mnt_flag &= ~MNT_QUOTA; + return (error); +} + +int +quota1_handle_cmd_get(struct ufsmount *ump, int type, int id, + int defaultq, prop_array_t replies) +{ + struct dquot *dq; + struct quotaval qv[QUOTA_NLIMITS]; + prop_dictionary_t dict; + int error; + uint64_t *valuesp[QUOTA_NLIMITS]; + valuesp[QUOTA_LIMIT_BLOCK] = &qv[QUOTA_LIMIT_BLOCK].qv_hardlimit; + valuesp[QUOTA_LIMIT_FILE] = &qv[QUOTA_LIMIT_FILE].qv_hardlimit; + + + if (ump->um_quotas[type] == NULLVP) + return ENODEV; + + if (defaultq) { /* we want the grace period of id 0 */ + if ((error = dqget(NULLVP, 0, ump, type, &dq)) != 0) + return error; + + } else { + if ((error = dqget(NULLVP, id, ump, type, &dq)) != 0) + return error; + } + dqblk_to_quotaval(&dq->dq_un.dq1_dqb, qv); + dqrele(NULLVP, dq); + if (defaultq) { + if (qv[QUOTA_LIMIT_BLOCK].qv_expiretime > 0) + qv[QUOTA_LIMIT_BLOCK].qv_grace = + qv[QUOTA_LIMIT_BLOCK].qv_expiretime; + else + qv[QUOTA_LIMIT_BLOCK].qv_grace = MAX_DQ_TIME; + if (qv[QUOTA_LIMIT_FILE].qv_expiretime > 0) + qv[QUOTA_LIMIT_FILE].qv_grace = + qv[QUOTA_LIMIT_FILE].qv_expiretime; + else + qv[QUOTA_LIMIT_FILE].qv_grace = MAX_DQ_TIME; + } + dict = quota64toprop(id, defaultq, valuesp, + ufs_quota_entry_names, UFS_QUOTA_NENTRIES, + ufs_quota_limit_names, QUOTA_NLIMITS); + if (dict == NULL) + return ENOMEM; + if (!prop_array_add_and_rel(replies, dict)) + return ENOMEM; + return 0; +} + +int +quota1_handle_cmd_set(struct ufsmount *ump, int type, int id, + int defaultq, prop_dictionary_t data) +{ + struct dquot *dq; + struct dqblk dqb; + int error; + uint64_t bval[2]; + uint64_t ival[2]; + const char *val_limitsonly_grace[] = {QUOTADICT_LIMIT_GTIME}; +#define Q1_GTIME 0 + const char *val_limitsonly_softhard[] = + {QUOTADICT_LIMIT_SOFT, QUOTADICT_LIMIT_HARD}; +#define Q1_SOFT 0 +#define Q1_HARD 1 + + uint64_t *valuesp[QUOTA_NLIMITS]; + valuesp[QUOTA_LIMIT_BLOCK] = bval; + valuesp[QUOTA_LIMIT_FILE] = ival; + + if (ump->um_quotas[type] == NULLVP) + return ENODEV; + + if (defaultq) 
{ + /* just update grace times */ + error = proptoquota64(data, valuesp, val_limitsonly_grace, 1, + ufs_quota_limit_names, QUOTA_NLIMITS); + if (error) + return error; + if ((error = dqget(NULLVP, id, ump, type, &dq)) != 0) + return error; + mutex_enter(&dq->dq_interlock); + if (bval[Q1_GTIME] > 0) + ump->umq1_btime[type] = dq->dq_btime = + bval[Q1_GTIME]; + if (ival[Q1_GTIME] > 0) + ump->umq1_itime[type] = dq->dq_itime = + ival[Q1_GTIME]; + mutex_exit(&dq->dq_interlock); + dq->dq_flags |= DQ_MOD; + dqrele(NULLVP, dq); + return 0; + } + error = proptoquota64(data, valuesp, val_limitsonly_softhard, 2, + ufs_quota_limit_names, QUOTA_NLIMITS); + if (error) + return error; + + if ((error = dqget(NULLVP, id, ump, type, &dq)) != 0) + return (error); + mutex_enter(&dq->dq_interlock); + /* + * Copy all but the current values. + * Reset time limit if previously had no soft limit or were + * under it, but now have a soft limit and are over it. + */ + dqb.dqb_curblocks = dq->dq_curblocks; + dqb.dqb_curinodes = dq->dq_curinodes; + dqb.dqb_btime = dq->dq_btime; + dqb.dqb_itime = dq->dq_itime; + dqb.dqb_bsoftlimit = (bval[Q1_SOFT] == UQUAD_MAX) ? 0 : bval[Q1_SOFT]; + dqb.dqb_bhardlimit = (bval[Q1_HARD] == UQUAD_MAX) ? 0 : bval[Q1_HARD]; + dqb.dqb_isoftlimit = (ival[Q1_SOFT] == UQUAD_MAX) ? 0 : ival[Q1_SOFT]; + dqb.dqb_ihardlimit = (ival[Q1_HARD] == UQUAD_MAX) ? 0 : ival[Q1_HARD]; + if (dq->dq_id == 0) { + /* also update grace time if available */ + if (proptoquota64(data, valuesp, val_limitsonly_grace, 1, + ufs_quota_limit_names, QUOTA_NLIMITS) == 0) { + if (bval[Q1_GTIME] > 0) + ump->umq1_btime[type] = dqb.dqb_btime = + bval[Q1_GTIME]; + if (ival[Q1_GTIME] > 0) + ump->umq1_itime[type] = dqb.dqb_itime = + ival[Q1_GTIME]; + } + } + if (dqb.dqb_bsoftlimit && + dq->dq_curblocks >= dqb.dqb_bsoftlimit && + (dq->dq_bsoftlimit == 0 || dq->dq_curblocks < dq->dq_bsoftlimit)) + dqb.dqb_btime = time_second + ump->umq1_btime[type]; + if (dqb.dqb_isoftlimit && + dq->dq_curinodes >= dqb.dqb_isoftlimit && + (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit)) + dqb.dqb_itime = time_second + ump->umq1_itime[type]; + dq->dq_un.dq1_dqb = dqb; + if (dq->dq_curblocks < dq->dq_bsoftlimit) + dq->dq_flags &= ~DQ_WARN(QL_BLOCK); + if (dq->dq_curinodes < dq->dq_isoftlimit) + dq->dq_flags &= ~DQ_WARN(QL_FILE); + if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && + dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) + dq->dq_flags |= DQ_FAKE; + else + dq->dq_flags &= ~DQ_FAKE; + dq->dq_flags |= DQ_MOD; + mutex_exit(&dq->dq_interlock); + dqrele(NULLVP, dq); + return (0); +} + + +#if 0 +/* + * Q_SETQUOTA - assign an entire dqblk structure. + */ +int +setquota1(struct mount *mp, u_long id, int type, struct dqblk *dqb) +{ + struct dquot *dq; + struct dquot *ndq; + struct ufsmount *ump = VFSTOUFS(mp); + + + if ((error = dqget(NULLVP, id, ump, type, &ndq)) != 0) + return (error); + dq = ndq; + mutex_enter(&dq->dq_interlock); + /* + * Copy all but the current values. + * Reset time limit if previously had no soft limit or were + * under it, but now have a soft limit and are over it. 
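/*
 * Illustrative sketch (not taken from the patch): when new limits are
 * installed (quota1_handle_cmd_set() above), the grace timer is (re)armed
 * only if there is now a soft limit, usage already meets it, and the old
 * soft limit was absent or not yet exceeded; otherwise the old deadline is
 * kept.  A one-function restatement with invented names:
 */
#include <time.h>

static time_t arm_grace(long cur, long old_soft, long new_soft,
                        time_t old_deadline, time_t now, long grace_secs)
{
    if (new_soft && cur >= new_soft && (old_soft == 0 || cur < old_soft))
        return now + grace_secs;             /* freshly over the new limit */
    return old_deadline;                     /* keep whatever was in effect */
}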
+ */ + dqb->dqb_curblocks = dq->dq_curblocks; + dqb->dqb_curinodes = dq->dq_curinodes; + if (dq->dq_id != 0) { + dqb->dqb_btime = dq->dq_btime; + dqb->dqb_itime = dq->dq_itime; + } + if (dqb->dqb_bsoftlimit && + dq->dq_curblocks >= dqb->dqb_bsoftlimit && + (dq->dq_bsoftlimit == 0 || dq->dq_curblocks < dq->dq_bsoftlimit)) + dqb->dqb_btime = time_second + ump->umq1_btime[type]; + if (dqb->dqb_isoftlimit && + dq->dq_curinodes >= dqb->dqb_isoftlimit && + (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit)) + dqb->dqb_itime = time_second + ump->umq1_itime[type]; + dq->dq_un.dq1_dqb = *dqb; + if (dq->dq_curblocks < dq->dq_bsoftlimit) + dq->dq_flags &= ~DQ_WARN(QL_BLOCK); + if (dq->dq_curinodes < dq->dq_isoftlimit) + dq->dq_flags &= ~DQ_WARN(QL_FILE); + if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && + dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) + dq->dq_flags |= DQ_FAKE; + else + dq->dq_flags &= ~DQ_FAKE; + dq->dq_flags |= DQ_MOD; + mutex_exit(&dq->dq_interlock); + dqrele(NULLVP, dq); + return (0); +} + +/* + * Q_SETUSE - set current inode and block usage. + */ +int +setuse(struct mount *mp, u_long id, int type, void *addr) +{ + struct dquot *dq; + struct ufsmount *ump = VFSTOUFS(mp); + struct dquot *ndq; + struct dqblk usage; + int error; + + error = copyin(addr, (void *)&usage, sizeof (struct dqblk)); + if (error) + return (error); + if ((error = dqget(NULLVP, id, ump, type, &ndq)) != 0) + return (error); + dq = ndq; + mutex_enter(&dq->dq_interlock); + /* + * Reset time limit if have a soft limit and were + * previously under it, but are now over it. + */ + if (dq->dq_bsoftlimit && dq->dq_curblocks < dq->dq_bsoftlimit && + usage.dqb_curblocks >= dq->dq_bsoftlimit) + dq->dq_btime = time_second + ump->umq1_btime[type]; + if (dq->dq_isoftlimit && dq->dq_curinodes < dq->dq_isoftlimit && + usage.dqb_curinodes >= dq->dq_isoftlimit) + dq->dq_itime = time_second + ump->umq1_itime[type]; + dq->dq_curblocks = usage.dqb_curblocks; + dq->dq_curinodes = usage.dqb_curinodes; + if (dq->dq_curblocks < dq->dq_bsoftlimit) + dq->dq_flags &= ~DQ_WARN(QL_BLOCK); + if (dq->dq_curinodes < dq->dq_isoftlimit) + dq->dq_flags &= ~DQ_WARN(QL_FILE); + dq->dq_flags |= DQ_MOD; + mutex_exit(&dq->dq_interlock); + dqrele(NULLVP, dq); + return (0); +} +#endif + +/* + * Q_SYNC - sync quota files to disk. + */ +int +q1sync(struct mount *mp) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct vnode *vp, *mvp; + struct dquot *dq; + int i, error; + + /* + * Check if the mount point has any quotas. + * If not, simply return. + */ + for (i = 0; i < MAXQUOTAS; i++) + if (ump->um_quotas[i] != NULLVP) + break; + if (i == MAXQUOTAS) + return (0); + + /* Allocate a marker vnode. */ + mvp = vnalloc(mp); + + /* + * Search vnodes associated with this mount point, + * synchronizing any modified dquot structures. 
+ */ + mutex_enter(&mntvnode_lock); + again: + for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) { + vmark(mvp, vp); + mutex_enter(vp->v_interlock); + if (VTOI(vp) == NULL || vp->v_mount != mp || vismarker(vp) || + vp->v_type == VNON || + (vp->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0) { + mutex_exit(vp->v_interlock); + continue; + } + mutex_exit(&mntvnode_lock); + error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT); + if (error) { + mutex_enter(&mntvnode_lock); + if (error == ENOENT) { + (void)vunmark(mvp); + goto again; + } + continue; + } + for (i = 0; i < MAXQUOTAS; i++) { + dq = VTOI(vp)->i_dquot[i]; + if (dq == NODQUOT) + continue; + mutex_enter(&dq->dq_interlock); + if (dq->dq_flags & DQ_MOD) + dq1sync(vp, dq); + mutex_exit(&dq->dq_interlock); + } + vput(vp); + mutex_enter(&mntvnode_lock); + } + mutex_exit(&mntvnode_lock); + vnfree(mvp); + return (0); +} + +/* + * Obtain a dquot structure for the specified identifier and quota file + * reading the information from the file if necessary. + */ +int +dq1get(struct vnode *dqvp, u_long id, struct ufsmount *ump, int type, + struct dquot *dq) +{ + struct iovec aiov; + struct uio auio; + int error; + + KASSERT(mutex_owned(&dq->dq_interlock)); + vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = (void *)&dq->dq_un.dq1_dqb; + aiov.iov_len = sizeof (struct dqblk); + auio.uio_resid = sizeof (struct dqblk); + auio.uio_offset = (off_t)(id * sizeof (struct dqblk)); + auio.uio_rw = UIO_READ; + UIO_SETUP_SYSSPACE(&auio); + error = VOP_READ(dqvp, &auio, 0, ump->um_cred[type]); + if (auio.uio_resid == sizeof(struct dqblk) && error == 0) + memset((void *)&dq->dq_un.dq1_dqb, 0, sizeof(struct dqblk)); + VOP_UNLOCK(dqvp); + /* + * I/O error in reading quota file, release + * quota structure and reflect problem to caller. + */ + if (error) + return (error); + /* + * Check for no limit to enforce. + * Initialize time values if necessary. + */ + if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && + dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) + dq->dq_flags |= DQ_FAKE; + if (dq->dq_id != 0) { + if (dq->dq_btime == 0) + dq->dq_btime = time_second + ump->umq1_btime[type]; + if (dq->dq_itime == 0) + dq->dq_itime = time_second + ump->umq1_itime[type]; + } + return (0); +} + +/* + * Update the disk quota in the quota file. 
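/*
 * Illustrative sketch (not taken from the patch): a quota v1 file is a flat
 * array of fixed-size records indexed by uid or gid, which is why dq1get()
 * above and dq1sync() below seek to id * sizeof(struct dqblk).  A user-space
 * model of that addressing with a made-up record type; reading past EOF is
 * treated as an all-zero (limitless) record, as the kernel code does.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

struct toy_dqblk {
    uint32_t hardlimit, softlimit, curblocks, pad;
};

static int record_read(int fd, unsigned long id, struct toy_dqblk *rec)
{
    off_t off = (off_t)id * sizeof(*rec);
    ssize_t n = pread(fd, rec, sizeof(*rec), off);

    if (n == 0)
        memset(rec, 0, sizeof(*rec));        /* hole past EOF: no limits yet */
    return (n == 0 || n == (ssize_t)sizeof(*rec)) ? 0 : -1;
}

static int record_write(int fd, unsigned long id, const struct toy_dqblk *rec)
{
    off_t off = (off_t)id * sizeof(*rec);

    return pwrite(fd, rec, sizeof(*rec), off) == (ssize_t)sizeof(*rec) ? 0 : -1;
}

int main(void)
{
    struct toy_dqblk rec = { 1000, 800, 12, 0 };
    int fd = open("toy.quota", O_RDWR | O_CREAT, 0600);

    if (fd == -1)
        return 1;
    record_write(fd, 1234, &rec);            /* store limits for uid 1234 */
    record_read(fd, 1234, &rec);
    printf("uid 1234 hard=%u\n", (unsigned)rec.hardlimit);
    close(fd);
    return 0;
}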
+ */ +int +dq1sync(struct vnode *vp, struct dquot *dq) +{ + struct vnode *dqvp; + struct iovec aiov; + struct uio auio; + int error; + + if (dq == NODQUOT) + panic("dq1sync: dquot"); + KASSERT(mutex_owned(&dq->dq_interlock)); + if ((dq->dq_flags & DQ_MOD) == 0) + return (0); + if ((dqvp = dq->dq_ump->um_quotas[dq->dq_type]) == NULLVP) + panic("dq1sync: file"); + KASSERT(dqvp != vp); + vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = (void *)&dq->dq_un.dq1_dqb; + aiov.iov_len = sizeof (struct dqblk); + auio.uio_resid = sizeof (struct dqblk); + auio.uio_offset = (off_t)(dq->dq_id * sizeof (struct dqblk)); + auio.uio_rw = UIO_WRITE; + UIO_SETUP_SYSSPACE(&auio); + error = VOP_WRITE(dqvp, &auio, 0, dq->dq_ump->um_cred[dq->dq_type]); + if (auio.uio_resid && error == 0) + error = EIO; + dq->dq_flags &= ~DQ_MOD; + VOP_UNLOCK(dqvp); + return (error); +} diff --git a/sys/ufs/ufs/ufs_quota2.c b/sys/ufs/ufs/ufs_quota2.c new file mode 100644 index 000000000..823e398bd --- /dev/null +++ b/sys/ufs/ufs/ufs_quota2.c @@ -0,0 +1,1012 @@ +/* $NetBSD: ufs_quota2.c,v 1.4 2011/06/07 14:56:13 bouyer Exp $ */ +/*- + * Copyright (c) 2010 Manuel Bouyer + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ufs_quota2.c,v 1.4 2011/06/07 14:56:13 bouyer Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * LOCKING: + * Data in the entries are protected by the associated struct dquot's + * dq_interlock (this means we can't read or change a quota entry without + * grabing a dquot for it). + * The header and lists (including pointers in the data entries, and q2e_uid) + * are protected by the global dqlock. 
+ * the locking order is dq_interlock -> dqlock + */ + +static int quota2_bwrite(struct mount *, struct buf *); +static int getinoquota2(struct inode *, bool, bool, struct buf **, + struct quota2_entry **); +static int getq2h(struct ufsmount *, int, struct buf **, + struct quota2_header **, int); +static int getq2e(struct ufsmount *, int, daddr_t, int, struct buf **, + struct quota2_entry **, int); +static int quota2_walk_list(struct ufsmount *, struct buf *, int, + uint64_t *, int, void *, + int (*func)(struct ufsmount *, uint64_t *, struct quota2_entry *, + uint64_t, void *)); + +static int quota2_dict_update_q2e_limits(prop_dictionary_t, + struct quota2_entry *); +static prop_dictionary_t q2etoprop(struct quota2_entry *, int); + +static const char *limnames[] = INITQLNAMES; + +static int +quota2_dict_update_q2e_limits(prop_dictionary_t data, + struct quota2_entry *q2e) +{ + const char *val_limitsonly_names[] = INITQVNAMES_LIMITSONLY; + + int i, error; + prop_dictionary_t val; + + for (i = 0; i < N_QL; i++) { + if (!prop_dictionary_get_dict(data, limnames[i], &val)) + return EINVAL; + error = quotaprop_dict_get_uint64(val, + &q2e->q2e_val[i].q2v_hardlimit, + val_limitsonly_names, N_QV, true); + if (error) + return error; + } + return 0; +} +static prop_dictionary_t +q2etoprop(struct quota2_entry *q2e, int def) +{ + const char *val_names[] = INITQVNAMES_ALL; + prop_dictionary_t dict1 = prop_dictionary_create(); + prop_dictionary_t dict2; + int i; + + if (dict1 == NULL) + return NULL; + + if (def) { + if (!prop_dictionary_set_cstring_nocopy(dict1, "id", + "default")) { + goto err; + } + } else { + if (!prop_dictionary_set_uint32(dict1, "id", q2e->q2e_uid)) { + goto err; + } + } + for (i = 0; i < N_QL; i++) { + dict2 = limits64toprop(&q2e->q2e_val[i].q2v_hardlimit, + val_names, N_QV); + if (dict2 == NULL) + goto err; + if (!prop_dictionary_set_and_rel(dict1, limnames[i], dict2)) + goto err; + } + return dict1; + +err: + prop_object_release(dict1); + return NULL; +} + + +static int +quota2_bwrite(struct mount *mp, struct buf *bp) +{ + if (mp->mnt_flag & MNT_SYNCHRONOUS) + return bwrite(bp); + else { + bdwrite(bp); + return 0; + } +} + +static int +getq2h(struct ufsmount *ump, int type, + struct buf **bpp, struct quota2_header **q2hp, int flags) +{ +#ifdef FFS_EI + const int needswap = UFS_MPNEEDSWAP(ump); +#endif + int error; + struct buf *bp; + struct quota2_header *q2h; + + KASSERT(mutex_owned(&dqlock)); + error = bread(ump->um_quotas[type], 0, ump->umq2_bsize, + ump->um_cred[type], flags, &bp); + if (error) + return error; + if (bp->b_resid != 0) + panic("dq2get: %s quota file truncated", quotatypes[type]); + + q2h = (void *)bp->b_data; + if (ufs_rw32(q2h->q2h_magic_number, needswap) != Q2_HEAD_MAGIC || + q2h->q2h_type != type) + panic("dq2get: corrupted %s quota header", quotatypes[type]); + *bpp = bp; + *q2hp = q2h; + return 0; +} + +static int +getq2e(struct ufsmount *ump, int type, daddr_t lblkno, int blkoffset, + struct buf **bpp, struct quota2_entry **q2ep, int flags) +{ + int error; + struct buf *bp; + + if (blkoffset & (sizeof(uint64_t) - 1)) { + panic("dq2get: %s quota file corrupted", + quotatypes[type]); + } + error = bread(ump->um_quotas[type], lblkno, ump->umq2_bsize, + ump->um_cred[type], flags, &bp); + if (error) + return error; + if (bp->b_resid != 0) { + panic("dq2get: %s quota file corrupted", + quotatypes[type]); + } + *q2ep = (void *)((char *)bp->b_data + blkoffset); + *bpp = bp; + return 0; +} + +/* walk a quota entry list, calling the callback for each entry */ 
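+/*
+ * A walk callback has the shape
+ *	int cb(struct ufsmount *ump, uint64_t *offp, struct quota2_entry *q2e,
+ *	    uint64_t off, void *cookie);
+ * it may rewrite *offp to unlink the current entry from its chain, and a
+ * return value with Q2WL_ABORT set stops the walk without reporting an error.
+ */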
+#define Q2WL_ABORT 0x10000000 + +static int +quota2_walk_list(struct ufsmount *ump, struct buf *hbp, int type, + uint64_t *offp, int flags, void *a, + int (*func)(struct ufsmount *, uint64_t *, struct quota2_entry *, uint64_t, void *)) +{ +#ifdef FFS_EI + const int needswap = UFS_MPNEEDSWAP(ump); +#endif + daddr_t off = ufs_rw64(*offp, needswap); + struct buf *bp, *obp = hbp; + int ret = 0, ret2 = 0; + struct quota2_entry *q2e; + daddr_t lblkno, blkoff, olblkno = 0; + + KASSERT(mutex_owner(&dqlock)); + + while (off != 0) { + lblkno = (off >> ump->um_mountp->mnt_fs_bshift); + blkoff = (off & ump->umq2_bmask); + if (lblkno == 0) { + /* in the header block */ + bp = hbp; + } else if (lblkno == olblkno) { + /* still in the same buf */ + bp = obp; + } else { + ret = bread(ump->um_quotas[type], lblkno, + ump->umq2_bsize, + ump->um_cred[type], flags, &bp); + if (ret) + return ret; + if (bp->b_resid != 0) { + panic("quota2_walk_list: %s quota file corrupted", + quotatypes[type]); + } + } + q2e = (void *)((char *)(bp->b_data) + blkoff); + ret = (*func)(ump, offp, q2e, off, a); + if (off != ufs_rw64(*offp, needswap)) { + /* callback changed parent's pointer, redo */ + off = ufs_rw64(*offp, needswap); + if (bp != hbp && bp != obp) + ret2 = bwrite(bp); + } else { + /* parent if now current */ + if (obp != bp && obp != hbp) { + if (flags & B_MODIFY) + ret2 = bwrite(obp); + else + brelse(obp, 0); + } + obp = bp; + olblkno = lblkno; + offp = &(q2e->q2e_next); + off = ufs_rw64(*offp, needswap); + } + if (ret) + break; + if (ret2) { + ret = ret2; + break; + } + } + if (obp != hbp) { + if (flags & B_MODIFY) + ret2 = bwrite(obp); + else + brelse(obp, 0); + } + if (ret & Q2WL_ABORT) + return 0; + if (ret == 0) + return ret2; + return ret; +} + +int +quota2_umount(struct mount *mp, int flags) +{ + int i, error; + struct ufsmount *ump = VFSTOUFS(mp); + + if ((ump->um_flags & UFS_QUOTA2) == 0) + return 0; + + for (i = 0; i < MAXQUOTAS; i++) { + if (ump->um_quotas[i] != NULLVP) { + error = vn_close(ump->um_quotas[i], FREAD|FWRITE, + ump->um_cred[i]); + if (error) { + printf("quota2_umount failed: close(%p) %d\n", + ump->um_quotas[i], error); + return error; + } + } + ump->um_quotas[i] = NULLVP; + } + return 0; +} + +static int +quota2_q2ealloc(struct ufsmount *ump, int type, uid_t uid, struct dquot *dq, + struct buf **bpp, struct quota2_entry **q2ep) +{ + int error, error2; + struct buf *hbp, *bp; + struct quota2_header *q2h; + struct quota2_entry *q2e; + daddr_t offset; + u_long hash_mask; + const int needswap = UFS_MPNEEDSWAP(ump); + + KASSERT(mutex_owned(&dq->dq_interlock)); + KASSERT(mutex_owned(&dqlock)); + error = getq2h(ump, type, &hbp, &q2h, B_MODIFY); + if (error) + return error; + offset = ufs_rw64(q2h->q2h_free, needswap); + if (offset == 0) { + struct vnode *vp = ump->um_quotas[type]; + struct inode *ip = VTOI(vp); + uint64_t size = ip->i_size; + /* need to alocate a new disk block */ + error = UFS_BALLOC(vp, size, ump->umq2_bsize, + ump->um_cred[type], B_CLRBUF | B_SYNC, &bp); + if (error) { + brelse(hbp, 0); + return error; + } + KASSERT((ip->i_size % ump->umq2_bsize) == 0); + ip->i_size += ump->umq2_bsize; + DIP_ASSIGN(ip, size, ip->i_size); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + uvm_vnp_setsize(vp, ip->i_size); + quota2_addfreeq2e(q2h, bp->b_data, size, ump->umq2_bsize, + needswap); + error = bwrite(bp); + error2 = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT); + if (error || error2) { + brelse(hbp, 0); + if (error) + return error; + return error2; + } + offset = ufs_rw64(q2h->q2h_free, needswap); + 
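/* the block just appended must have put entries on the free list */ +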
KASSERT(offset != 0); + } + dq->dq2_lblkno = (offset >> ump->um_mountp->mnt_fs_bshift); + dq->dq2_blkoff = (offset & ump->umq2_bmask); + if (dq->dq2_lblkno == 0) { + bp = hbp; + q2e = (void *)((char *)bp->b_data + dq->dq2_blkoff); + } else { + error = getq2e(ump, type, dq->dq2_lblkno, + dq->dq2_blkoff, &bp, &q2e, B_MODIFY); + if (error) { + brelse(hbp, 0); + return error; + } + } + hash_mask = ((1 << q2h->q2h_hash_shift) - 1); + /* remove from free list */ + q2h->q2h_free = q2e->q2e_next; + + memcpy(q2e, &q2h->q2h_defentry, sizeof(*q2e)); + q2e->q2e_uid = ufs_rw32(uid, needswap); + /* insert in hash list */ + q2e->q2e_next = q2h->q2h_entries[uid & hash_mask]; + q2h->q2h_entries[uid & hash_mask] = ufs_rw64(offset, needswap); + if (hbp != bp) { + bwrite(hbp); + } + *q2ep = q2e; + *bpp = bp; + return 0; +} + +static int +getinoquota2(struct inode *ip, bool alloc, bool modify, struct buf **bpp, + struct quota2_entry **q2ep) +{ + int error; + int i; + struct dquot *dq; + struct ufsmount *ump = ip->i_ump; + u_int32_t ino_ids[MAXQUOTAS]; + + error = getinoquota(ip); + if (error) + return error; + + if (alloc) { + UFS_WAPBL_JLOCK_ASSERT(ump->um_mountp); + } + ino_ids[USRQUOTA] = ip->i_uid; + ino_ids[GRPQUOTA] = ip->i_gid; + /* first get the interlock for all dquot */ + for (i = 0; i < MAXQUOTAS; i++) { + dq = ip->i_dquot[i]; + if (dq == NODQUOT) + continue; + mutex_enter(&dq->dq_interlock); + } + /* now get the corresponding quota entry */ + for (i = 0; i < MAXQUOTAS; i++) { + bpp[i] = NULL; + q2ep[i] = NULL; + dq = ip->i_dquot[i]; + if (dq == NODQUOT) + continue; + if (__predict_false(ump->um_quotas[i] == NULL)) { + /* + * quotas have been turned off. This can happen + * at umount time. + */ + mutex_exit(&dq->dq_interlock); + dqrele(NULLVP, dq); + ip->i_dquot[i] = NULL; + continue; + } + + if ((dq->dq2_lblkno | dq->dq2_blkoff) == 0) { + if (!alloc) { + continue; + } + /* need to alloc a new on-disk quot */ + mutex_enter(&dqlock); + error = quota2_q2ealloc(ump, i, ino_ids[i], dq, + &bpp[i], &q2ep[i]); + mutex_exit(&dqlock); + if (error) + return error; + } else { + error = getq2e(ump, i, dq->dq2_lblkno, + dq->dq2_blkoff, &bpp[i], &q2ep[i], + modify ? 
B_MODIFY : 0); + if (error) + return error; + } + } + return 0; +} + +static int +quota2_check(struct inode *ip, int vtype, int64_t change, kauth_cred_t cred, + int flags) +{ + int error; + struct buf *bp[MAXQUOTAS]; + struct quota2_entry *q2e[MAXQUOTAS]; + struct quota2_val *q2vp; + struct dquot *dq; + uint64_t ncurblks; + struct ufsmount *ump = ip->i_ump; + struct mount *mp = ump->um_mountp; + const int needswap = UFS_MPNEEDSWAP(ump); + int i; + + if ((error = getinoquota2(ip, change > 0, change != 0, bp, q2e)) != 0) + return error; + if (change == 0) { + for (i = 0; i < MAXQUOTAS; i++) { + dq = ip->i_dquot[i]; + if (dq == NODQUOT) + continue; + if (bp[i]) + brelse(bp[i], 0); + mutex_exit(&dq->dq_interlock); + } + return 0; + } + if (change < 0) { + for (i = 0; i < MAXQUOTAS; i++) { + dq = ip->i_dquot[i]; + if (dq == NODQUOT) + continue; + if (q2e[i] == NULL) { + mutex_exit(&dq->dq_interlock); + continue; + } + q2vp = &q2e[i]->q2e_val[vtype]; + ncurblks = ufs_rw64(q2vp->q2v_cur, needswap); + if (ncurblks < -change) + ncurblks = 0; + else + ncurblks += change; + q2vp->q2v_cur = ufs_rw64(ncurblks, needswap); + quota2_bwrite(mp, bp[i]); + mutex_exit(&dq->dq_interlock); + } + return 0; + } + /* see if the allocation is allowed */ + for (i = 0; i < MAXQUOTAS; i++) { + struct quota2_val q2v; + int ql_stat; + dq = ip->i_dquot[i]; + if (dq == NODQUOT) + continue; + KASSERT(q2e[i] != NULL); + quota2_ufs_rwq2v(&q2e[i]->q2e_val[vtype], &q2v, needswap); + ql_stat = quota2_check_limit(&q2v, change, time_second); + + if ((flags & FORCE) == 0 && + kauth_authorize_system(cred, KAUTH_SYSTEM_FS_QUOTA, + KAUTH_REQ_SYSTEM_FS_QUOTA_NOLIMIT, + KAUTH_ARG(i), KAUTH_ARG(vtype), NULL) != 0) { + /* enforce this limit */ + switch(QL_STATUS(ql_stat)) { + case QL_S_DENY_HARD: + if ((dq->dq_flags & DQ_WARN(vtype)) == 0) { + uprintf("\n%s: write failed, %s %s " + "limit reached\n", + mp->mnt_stat.f_mntonname, + quotatypes[i], limnames[vtype]); + dq->dq_flags |= DQ_WARN(vtype); + } + error = EDQUOT; + break; + case QL_S_DENY_GRACE: + if ((dq->dq_flags & DQ_WARN(vtype)) == 0) { + uprintf("\n%s: write failed, %s %s " + "limit reached\n", + mp->mnt_stat.f_mntonname, + quotatypes[i], limnames[vtype]); + dq->dq_flags |= DQ_WARN(vtype); + } + error = EDQUOT; + break; + case QL_S_ALLOW_SOFT: + if ((dq->dq_flags & DQ_WARN(vtype)) == 0) { + uprintf("\n%s: warning, %s %s " + "quota exceeded\n", + mp->mnt_stat.f_mntonname, + quotatypes[i], limnames[vtype]); + dq->dq_flags |= DQ_WARN(vtype); + } + break; + } + } + /* + * always do this; we don't know if the allocation will + * succed or not in the end. 
if we don't do the allocation + * q2v_time will be ignored anyway + */ + if (ql_stat & QL_F_CROSS) { + q2v.q2v_time = time_second + q2v.q2v_grace; + quota2_ufs_rwq2v(&q2v, &q2e[i]->q2e_val[vtype], + needswap); + } + } + + /* now do the allocation if allowed */ + for (i = 0; i < MAXQUOTAS; i++) { + dq = ip->i_dquot[i]; + if (dq == NODQUOT) + continue; + KASSERT(q2e[i] != NULL); + if (error == 0) { + q2vp = &q2e[i]->q2e_val[vtype]; + ncurblks = ufs_rw64(q2vp->q2v_cur, needswap); + q2vp->q2v_cur = ufs_rw64(ncurblks + change, needswap); + quota2_bwrite(mp, bp[i]); + } else + brelse(bp[i], 0); + mutex_exit(&dq->dq_interlock); + } + return error; +} + +int +chkdq2(struct inode *ip, int64_t change, kauth_cred_t cred, int flags) +{ + return quota2_check(ip, QL_BLOCK, change, cred, flags); +} + +int +chkiq2(struct inode *ip, int32_t change, kauth_cred_t cred, int flags) +{ + return quota2_check(ip, QL_FILE, change, cred, flags); +} + +int +quota2_handle_cmd_set(struct ufsmount *ump, int type, int id, + int defaultq, prop_dictionary_t data) +{ + int error; + struct dquot *dq; + struct quota2_header *q2h; + struct quota2_entry q2e, *q2ep; + struct buf *bp; + const int needswap = UFS_MPNEEDSWAP(ump); + + if (ump->um_quotas[type] == NULLVP) + return ENODEV; + error = UFS_WAPBL_BEGIN(ump->um_mountp); + if (error) + return error; + + if (defaultq) { + mutex_enter(&dqlock); + error = getq2h(ump, type, &bp, &q2h, B_MODIFY); + if (error) { + mutex_exit(&dqlock); + goto out_wapbl; + } + quota2_ufs_rwq2e(&q2h->q2h_defentry, &q2e, needswap); + error = quota2_dict_update_q2e_limits(data, &q2e); + if (error) { + mutex_exit(&dqlock); + brelse(bp, 0); + goto out_wapbl; + } + quota2_ufs_rwq2e(&q2e, &q2h->q2h_defentry, needswap); + mutex_exit(&dqlock); + quota2_bwrite(ump->um_mountp, bp); + goto out_wapbl; + } + + error = dqget(NULLVP, id, ump, type, &dq); + if (error) + goto out_wapbl; + + mutex_enter(&dq->dq_interlock); + if (dq->dq2_lblkno == 0 && dq->dq2_blkoff == 0) { + /* need to alloc a new on-disk quot */ + mutex_enter(&dqlock); + error = quota2_q2ealloc(ump, type, id, dq, &bp, &q2ep); + mutex_exit(&dqlock); + } else { + error = getq2e(ump, type, dq->dq2_lblkno, dq->dq2_blkoff, + &bp, &q2ep, B_MODIFY); + } + if (error) + goto out_il; + + quota2_ufs_rwq2e(q2ep, &q2e, needswap); + error = quota2_dict_update_q2e_limits(data, &q2e); + if (error) { + brelse(bp, 0); + goto out_il; + } + quota2_ufs_rwq2e(&q2e, q2ep, needswap); + quota2_bwrite(ump->um_mountp, bp); + +out_il: + mutex_exit(&dq->dq_interlock); + dqrele(NULLVP, dq); +out_wapbl: + UFS_WAPBL_END(ump->um_mountp); + return error; +} + +struct dq2clear_callback { + uid_t id; + struct dquot *dq; + struct quota2_header *q2h; +}; + +static int +dq2clear_callback(struct ufsmount *ump, uint64_t *offp, struct quota2_entry *q2e, + uint64_t off, void *v) +{ + struct dq2clear_callback *c = v; +#ifdef FFS_EI + const int needswap = UFS_MPNEEDSWAP(ump); +#endif + uint64_t myoff; + + if (ufs_rw32(q2e->q2e_uid, needswap) == c->id) { + KASSERT(mutex_owned(&c->dq->dq_interlock)); + c->dq->dq2_lblkno = 0; + c->dq->dq2_blkoff = 0; + myoff = *offp; + /* remove from hash list */ + *offp = q2e->q2e_next; + /* add to free list */ + q2e->q2e_next = c->q2h->q2h_free; + c->q2h->q2h_free = myoff; + return Q2WL_ABORT; + } + return 0; +} +int +quota2_handle_cmd_clear(struct ufsmount *ump, int type, int id, + int defaultq, prop_dictionary_t data) +{ + int error, i; + struct dquot *dq; + struct quota2_header *q2h; + struct quota2_entry q2e, *q2ep; + struct buf *hbp, *bp; + u_long 
hash_mask; + struct dq2clear_callback c; + + if (ump->um_quotas[type] == NULLVP) + return ENODEV; + if (defaultq) + return EOPNOTSUPP; + + /* get the default entry before locking the entry's buffer */ + mutex_enter(&dqlock); + error = getq2h(ump, type, &hbp, &q2h, 0); + if (error) { + mutex_exit(&dqlock); + return error; + } + /* we'll copy to another disk entry, so no need to swap */ + memcpy(&q2e, &q2h->q2h_defentry, sizeof(q2e)); + mutex_exit(&dqlock); + brelse(hbp, 0); + + error = dqget(NULLVP, id, ump, type, &dq); + if (error) + return error; + + mutex_enter(&dq->dq_interlock); + if (dq->dq2_lblkno == 0 && dq->dq2_blkoff == 0) { + /* already clear, nothing to do */ + error = ENOENT; + goto out_il; + } + error = UFS_WAPBL_BEGIN(ump->um_mountp); + if (error) + goto out_dq; + + error = getq2e(ump, type, dq->dq2_lblkno, dq->dq2_blkoff, + &bp, &q2ep, B_MODIFY); + if (error) + goto out_wapbl; + + if (q2ep->q2e_val[QL_BLOCK].q2v_cur != 0 || + q2ep->q2e_val[QL_FILE].q2v_cur != 0) { + /* can't free this entry; revert to default */ + for (i = 0; i < N_QL; i++) { + q2ep->q2e_val[i].q2v_softlimit = + q2e.q2e_val[i].q2v_softlimit; + q2ep->q2e_val[i].q2v_hardlimit = + q2e.q2e_val[i].q2v_hardlimit; + q2ep->q2e_val[i].q2v_grace = + q2e.q2e_val[i].q2v_grace; + q2ep->q2e_val[i].q2v_time = 0; + } + quota2_bwrite(ump->um_mountp, bp); + goto out_wapbl; + } + /* we can free it. release bp so we can walk the list */ + brelse(bp, 0); + mutex_enter(&dqlock); + error = getq2h(ump, type, &hbp, &q2h, 0); + if (error) + goto out_dqlock; + + hash_mask = ((1 << q2h->q2h_hash_shift) - 1); + c.dq = dq; + c.id = id; + c.q2h = q2h; + error = quota2_walk_list(ump, hbp, type, + &q2h->q2h_entries[id & hash_mask], B_MODIFY, &c, + dq2clear_callback); + + bwrite(hbp); + +out_dqlock: + mutex_exit(&dqlock); +out_wapbl: + UFS_WAPBL_END(ump->um_mountp); +out_il: + mutex_exit(&dq->dq_interlock); +out_dq: + dqrele(NULLVP, dq); + return error; +} + +static int +quota2_array_add_q2e(struct ufsmount *ump, int type, + int id, prop_array_t replies) +{ + struct dquot *dq; + int error; + struct quota2_entry *q2ep, q2e; + struct buf *bp; + const int needswap = UFS_MPNEEDSWAP(ump); + prop_dictionary_t dict; + + error = dqget(NULLVP, id, ump, type, &dq); + if (error) + return error; + + mutex_enter(&dq->dq_interlock); + if (dq->dq2_lblkno == 0 && dq->dq2_blkoff == 0) { + mutex_exit(&dq->dq_interlock); + dqrele(NULLVP, dq); + return ENOENT; + } + error = getq2e(ump, type, dq->dq2_lblkno, dq->dq2_blkoff, + &bp, &q2ep, 0); + if (error) { + mutex_exit(&dq->dq_interlock); + dqrele(NULLVP, dq); + return error; + } + quota2_ufs_rwq2e(q2ep, &q2e, needswap); + brelse(bp, 0); + mutex_exit(&dq->dq_interlock); + dqrele(NULLVP, dq); + dict = q2etoprop(&q2e, 0); + if (dict == NULL) + return ENOMEM; + if (!prop_array_add_and_rel(replies, dict)) + return ENOMEM; + return 0; +} + +int +quota2_handle_cmd_get(struct ufsmount *ump, int type, int id, + int defaultq, prop_array_t replies) +{ + int error; + struct quota2_header *q2h; + struct quota2_entry q2e; + struct buf *bp; + prop_dictionary_t dict; + const int needswap = UFS_MPNEEDSWAP(ump); + + if (ump->um_quotas[type] == NULLVP) + return ENODEV; + if (defaultq) { + mutex_enter(&dqlock); + error = getq2h(ump, type, &bp, &q2h, 0); + if (error) { + mutex_exit(&dqlock); + return error; + } + quota2_ufs_rwq2e(&q2h->q2h_defentry, &q2e, needswap); + mutex_exit(&dqlock); + brelse(bp, 0); + dict = q2etoprop(&q2e, defaultq); + if (dict == NULL) + return ENOMEM; + if (!prop_array_add_and_rel(replies, dict)) + return 
ENOMEM; + } else + error = quota2_array_add_q2e(ump, type, id, replies); + + return error; +} + +struct getuids { + long nuids; /* number of uids in array */ + long size; /* size of array */ + uid_t *uids; /* array of uids, dynamically allocated */ +}; + +static int +quota2_getuids_callback(struct ufsmount *ump, uint64_t *offp, + struct quota2_entry *q2ep, uint64_t off, void *v) +{ + struct getuids *gu = v; + uid_t *newuids; +#ifdef FFS_EI + const int needswap = UFS_MPNEEDSWAP(ump); +#endif + + if (gu->nuids == gu->size) { + newuids = realloc(gu->uids, gu->size + PAGE_SIZE, M_TEMP, + M_WAITOK); + if (newuids == NULL) { + free(gu->uids, M_TEMP); + return ENOMEM; + } + gu->uids = newuids; + gu->size += (PAGE_SIZE / sizeof(uid_t)); + } + gu->uids[gu->nuids] = ufs_rw32(q2ep->q2e_uid, needswap); + gu->nuids++; + return 0; +} + +int +quota2_handle_cmd_getall(struct ufsmount *ump, int type, prop_array_t replies) +{ + int error; + struct quota2_header *q2h; + struct quota2_entry q2e; + struct buf *hbp; + prop_dictionary_t dict; + uint64_t offset; + int i, j; + int quota2_hash_size; + const int needswap = UFS_MPNEEDSWAP(ump); + struct getuids gu; + + if (ump->um_quotas[type] == NULLVP) + return ENODEV; + mutex_enter(&dqlock); + error = getq2h(ump, type, &hbp, &q2h, 0); + if (error) { + mutex_exit(&dqlock); + return error; + } + quota2_ufs_rwq2e(&q2h->q2h_defentry, &q2e, needswap); + dict = q2etoprop(&q2e, 1); + if (!prop_array_add_and_rel(replies, dict)) { + error = ENOMEM; + goto error_bp; + } + /* + * we can't directly get entries as we can't walk the list + * with qdlock and grab dq_interlock to read the entries + * at the same time. So just walk the lists to build a list of uid, + * and then read entries for these uids + */ + memset(&gu, 0, sizeof(gu)); + quota2_hash_size = ufs_rw16(q2h->q2h_hash_size, needswap); + for (i = 0; i < quota2_hash_size ; i++) { + offset = q2h->q2h_entries[i]; + error = quota2_walk_list(ump, hbp, type, &offset, 0, &gu, + quota2_getuids_callback); + if (error) { + if (gu.uids != NULL) + free(gu.uids, M_TEMP); + break; + } + } +error_bp: + mutex_exit(&dqlock); + brelse(hbp, 0); + if (error) + return error; + for (j = 0; j < gu.nuids; j++) { + error = quota2_array_add_q2e(ump, type, + gu.uids[j], replies); + if (error && error != ENOENT) + break; + } + free(gu.uids, M_TEMP); + return error; +} + +int +q2sync(struct mount *mp) +{ + return 0; +} + +struct dq2get_callback { + uid_t id; + struct dquot *dq; +}; + +static int +dq2get_callback(struct ufsmount *ump, uint64_t *offp, struct quota2_entry *q2e, + uint64_t off, void *v) +{ + struct dq2get_callback *c = v; + daddr_t lblkno; + int blkoff; +#ifdef FFS_EI + const int needswap = UFS_MPNEEDSWAP(ump); +#endif + + if (ufs_rw32(q2e->q2e_uid, needswap) == c->id) { + KASSERT(mutex_owned(&c->dq->dq_interlock)); + lblkno = (off >> ump->um_mountp->mnt_fs_bshift); + blkoff = (off & ump->umq2_bmask); + c->dq->dq2_lblkno = lblkno; + c->dq->dq2_blkoff = blkoff; + return Q2WL_ABORT; + } + return 0; +} + +int +dq2get(struct vnode *dqvp, u_long id, struct ufsmount *ump, int type, + struct dquot *dq) +{ + struct buf *bp; + struct quota2_header *q2h; + int error; + daddr_t offset; + u_long hash_mask; + struct dq2get_callback c = { + .id = id, + .dq = dq + }; + + KASSERT(mutex_owned(&dq->dq_interlock)); + mutex_enter(&dqlock); + error = getq2h(ump, type, &bp, &q2h, 0); + if (error) + goto out_mutex; + /* look for our entry */ + hash_mask = ((1 << q2h->q2h_hash_shift) - 1); + offset = q2h->q2h_entries[id & hash_mask]; + error = 
quota2_walk_list(ump, bp, type, &offset, 0, (void *)&c, + dq2get_callback); + brelse(bp, 0); +out_mutex: + mutex_exit(&dqlock); + return error; +} + +int +dq2sync(struct vnode *vp, struct dquot *dq) +{ + return 0; +} diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c new file mode 100644 index 000000000..4ab40c8c9 --- /dev/null +++ b/sys/ufs/ufs/ufs_readwrite.c @@ -0,0 +1,533 @@ +/* $NetBSD: ufs_readwrite.c,v 1.100 2011/11/18 21:18:52 christos Exp $ */ + +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95 + */ + +#include +__KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.100 2011/11/18 21:18:52 christos Exp $"); + +#ifdef LFS_READWRITE +#define FS struct lfs +#define I_FS i_lfs +#define READ lfs_read +#define READ_S "lfs_read" +#define WRITE lfs_write +#define WRITE_S "lfs_write" +#define fs_bsize lfs_bsize +#define fs_bmask lfs_bmask +#define UFS_WAPBL_BEGIN(mp) 0 +#define UFS_WAPBL_END(mp) do { } while (0) +#define UFS_WAPBL_UPDATE(vp, access, modify, flags) do { } while (0) +#else +#define FS struct fs +#define I_FS i_fs +#define READ ffs_read +#define READ_S "ffs_read" +#define WRITE ffs_write +#define WRITE_S "ffs_write" +#endif + +/* + * Vnode op for reading. 
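+ * For regular files the copy goes through the page cache (ubc_uiomove);
+ * other vnode types, and the LFS ifile, use the buffer cache loop below.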
+ */ +/* ARGSUSED */ +int +READ(void *v) +{ + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp; + struct inode *ip; + struct uio *uio; + struct ufsmount *ump; + struct buf *bp; + FS *fs; + vsize_t bytelen; + daddr_t lbn, nextlbn; + off_t bytesinfile; + long size, xfersize, blkoffset; + int error, ioflag; + bool usepc = false; + + vp = ap->a_vp; + ip = VTOI(vp); + ump = ip->i_ump; + uio = ap->a_uio; + ioflag = ap->a_ioflag; + error = 0; + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_READ) + panic("%s: mode", READ_S); + + if (vp->v_type == VLNK) { + if (ip->i_size < ump->um_maxsymlinklen || + (ump->um_maxsymlinklen == 0 && DIP(ip, blocks) == 0)) + panic("%s: short symlink", READ_S); + } else if (vp->v_type != VREG && vp->v_type != VDIR) + panic("%s: type %d", READ_S, vp->v_type); +#endif + fs = ip->I_FS; + if ((u_int64_t)uio->uio_offset > ump->um_maxfilesize) + return (EFBIG); + if (uio->uio_resid == 0) + return (0); + +#ifndef LFS_READWRITE + if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT) + return ffs_snapshot_read(vp, uio, ioflag); +#endif /* !LFS_READWRITE */ + + fstrans_start(vp->v_mount, FSTRANS_SHARED); + + if (uio->uio_offset >= ip->i_size) + goto out; + +#ifdef LFS_READWRITE + usepc = (vp->v_type == VREG && ip->i_number != LFS_IFILE_INUM); +#else /* !LFS_READWRITE */ + usepc = vp->v_type == VREG; +#endif /* !LFS_READWRITE */ + if (usepc) { + const int advice = IO_ADV_DECODE(ap->a_ioflag); + + while (uio->uio_resid > 0) { + if (ioflag & IO_DIRECT) { + genfs_directio(vp, uio, ioflag); + } + bytelen = MIN(ip->i_size - uio->uio_offset, + uio->uio_resid); + if (bytelen == 0) + break; + error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice, + UBC_READ | UBC_PARTIALOK | UBC_UNMAP_FLAG(vp)); + if (error) + break; + } + goto out; + } + + for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { + bytesinfile = ip->i_size - uio->uio_offset; + if (bytesinfile <= 0) + break; + lbn = lblkno(fs, uio->uio_offset); + nextlbn = lbn + 1; + size = blksize(fs, ip, lbn); + blkoffset = blkoff(fs, uio->uio_offset); + xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid), + bytesinfile); + + if (lblktosize(fs, nextlbn) >= ip->i_size) + error = bread(vp, lbn, size, NOCRED, 0, &bp); + else { + int nextsize = blksize(fs, ip, nextlbn); + error = breadn(vp, lbn, + size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp); + } + if (error) + break; + + /* + * We should only get non-zero b_resid when an I/O error + * has occurred, which should cause us to break above. + * However, if the short read did not cause an error, + * then we want to ensure that we do not uiomove bad + * or uninitialized data. + */ + size -= bp->b_resid; + if (size < xfersize) { + if (size == 0) + break; + xfersize = size; + } + error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); + if (error) + break; + brelse(bp, 0); + } + if (bp != NULL) + brelse(bp, 0); + + out: + if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) { + ip->i_flag |= IN_ACCESS; + if ((ap->a_ioflag & IO_SYNC) == IO_SYNC) { + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) { + fstrans_done(vp->v_mount); + return error; + } + error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT); + UFS_WAPBL_END(vp->v_mount); + } + } + + fstrans_done(vp->v_mount); + return (error); +} + +/* + * Vnode op for writing. 
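+ * Regular files go through the page cache, allocating backing store with
+ * ufs_balloc_range (or GOP_ALLOC when a whole block is overwritten);
+ * directories and symlinks use the buffer cache path, and a failed write
+ * is truncated back to the original size.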
+ */ +int +WRITE(void *v) +{ + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp; + struct uio *uio; + struct inode *ip; + FS *fs; + struct buf *bp; + kauth_cred_t cred; + daddr_t lbn; + off_t osize, origoff, oldoff, preallocoff, endallocoff, nsize; + int blkoffset, error, flags, ioflag, resid, size, xfersize; + int aflag; + int extended=0; + vsize_t bytelen; + bool async; + bool usepc = false; +#ifdef LFS_READWRITE + bool need_unreserve = false; +#endif + struct ufsmount *ump; + + cred = ap->a_cred; + ioflag = ap->a_ioflag; + uio = ap->a_uio; + vp = ap->a_vp; + ip = VTOI(vp); + ump = ip->i_ump; + + KASSERT(vp->v_size == ip->i_size); +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_WRITE) + panic("%s: mode", WRITE_S); +#endif + + switch (vp->v_type) { + case VREG: + if (ioflag & IO_APPEND) + uio->uio_offset = ip->i_size; + if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) + return (EPERM); + /* FALLTHROUGH */ + case VLNK: + break; + case VDIR: + if ((ioflag & IO_SYNC) == 0) + panic("%s: nonsync dir write", WRITE_S); + break; + default: + panic("%s: type", WRITE_S); + } + + fs = ip->I_FS; + if (uio->uio_offset < 0 || + (u_int64_t)uio->uio_offset + uio->uio_resid > ump->um_maxfilesize) + return (EFBIG); +#ifdef LFS_READWRITE + /* Disallow writes to the Ifile, even if noschg flag is removed */ + /* XXX can this go away when the Ifile is no longer in the namespace? */ + if (vp == fs->lfs_ivnode) + return (EPERM); +#endif + if (uio->uio_resid == 0) + return (0); + + fstrans_start(vp->v_mount, FSTRANS_SHARED); + + flags = ioflag & IO_SYNC ? B_SYNC : 0; + async = vp->v_mount->mnt_flag & MNT_ASYNC; + origoff = uio->uio_offset; + resid = uio->uio_resid; + osize = ip->i_size; + error = 0; + + usepc = vp->v_type == VREG; + + if ((ioflag & IO_JOURNALLOCKED) == 0) { + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) { + fstrans_done(vp->v_mount); + return error; + } + } + +#ifdef LFS_READWRITE + async = true; + lfs_check(vp, LFS_UNUSED_LBN, 0); +#endif /* !LFS_READWRITE */ + if (!usepc) + goto bcache; + + preallocoff = round_page(blkroundup(fs, MAX(osize, uio->uio_offset))); + aflag = ioflag & IO_SYNC ? B_SYNC : 0; + nsize = MAX(osize, uio->uio_offset + uio->uio_resid); + endallocoff = nsize - blkoff(fs, nsize); + + /* + * if we're increasing the file size, deal with expanding + * the fragment if there is one. + */ + + if (nsize > osize && lblkno(fs, osize) < NDADDR && + lblkno(fs, osize) != lblkno(fs, nsize) && + blkroundup(fs, osize) != osize) { + off_t eob; + + eob = blkroundup(fs, osize); + uvm_vnp_setwritesize(vp, eob); + error = ufs_balloc_range(vp, osize, eob - osize, cred, aflag); + if (error) + goto out; + if (flags & B_SYNC) { + mutex_enter(vp->v_interlock); + VOP_PUTPAGES(vp, trunc_page(osize & fs->fs_bmask), + round_page(eob), + PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED); + } + } + + while (uio->uio_resid > 0) { + int ubc_flags = UBC_WRITE; + bool overwrite; /* if we're overwrite a whole block */ + off_t newoff; + + if (ioflag & IO_DIRECT) { + genfs_directio(vp, uio, ioflag | IO_JOURNALLOCKED); + } + + oldoff = uio->uio_offset; + blkoffset = blkoff(fs, uio->uio_offset); + bytelen = MIN(fs->fs_bsize - blkoffset, uio->uio_resid); + if (bytelen == 0) { + break; + } + + /* + * if we're filling in a hole, allocate the blocks now and + * initialize the pages first. 
if we're extending the file, + * we can safely allocate blocks without initializing pages + * since the new blocks will be inaccessible until the write + * is complete. + */ + overwrite = uio->uio_offset >= preallocoff && + uio->uio_offset < endallocoff; + if (!overwrite && (vp->v_vflag & VV_MAPPED) == 0 && + blkoff(fs, uio->uio_offset) == 0 && + (uio->uio_offset & PAGE_MASK) == 0) { + vsize_t len; + + len = trunc_page(bytelen); + len -= blkoff(fs, len); + if (len > 0) { + overwrite = true; + bytelen = len; + } + } + + newoff = oldoff + bytelen; + if (vp->v_size < newoff) { + uvm_vnp_setwritesize(vp, newoff); + } + + if (!overwrite) { + error = ufs_balloc_range(vp, uio->uio_offset, bytelen, + cred, aflag); + if (error) + break; + } else { + genfs_node_wrlock(vp); + error = GOP_ALLOC(vp, uio->uio_offset, bytelen, + aflag, cred); + genfs_node_unlock(vp); + if (error) + break; + ubc_flags |= UBC_FAULTBUSY; + } + + /* + * copy the data. + */ + + error = ubc_uiomove(&vp->v_uobj, uio, bytelen, + IO_ADV_DECODE(ioflag), ubc_flags | UBC_UNMAP_FLAG(vp)); + + /* + * update UVM's notion of the size now that we've + * copied the data into the vnode's pages. + * + * we should update the size even when uiomove failed. + */ + + if (vp->v_size < newoff) { + uvm_vnp_setsize(vp, newoff); + extended = 1; + } + + if (error) + break; + + /* + * flush what we just wrote if necessary. + * XXXUBC simplistic async flushing. + */ + +#ifndef LFS_READWRITE + if (!async && oldoff >> 16 != uio->uio_offset >> 16) { + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, (oldoff >> 16) << 16, + (uio->uio_offset >> 16) << 16, + PGO_CLEANIT | PGO_JOURNALLOCKED); + if (error) + break; + } +#endif + } + if (error == 0 && ioflag & IO_SYNC) { + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, trunc_page(origoff & fs->fs_bmask), + round_page(blkroundup(fs, uio->uio_offset)), + PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED); + } + goto out; + + bcache: + mutex_enter(vp->v_interlock); + VOP_PUTPAGES(vp, trunc_page(origoff), round_page(origoff + resid), + PGO_CLEANIT | PGO_FREE | PGO_SYNCIO | PGO_JOURNALLOCKED); + while (uio->uio_resid > 0) { + lbn = lblkno(fs, uio->uio_offset); + blkoffset = blkoff(fs, uio->uio_offset); + xfersize = MIN(fs->fs_bsize - blkoffset, uio->uio_resid); + if (fs->fs_bsize > xfersize) + flags |= B_CLRBUF; + else + flags &= ~B_CLRBUF; + +#ifdef LFS_READWRITE + error = lfs_reserve(fs, vp, NULL, + btofsb(fs, (NIADDR + 1) << fs->lfs_bshift)); + if (error) + break; + need_unreserve = true; +#endif + error = UFS_BALLOC(vp, uio->uio_offset, xfersize, + ap->a_cred, flags, &bp); + + if (error) + break; + if (uio->uio_offset + xfersize > ip->i_size) { + ip->i_size = uio->uio_offset + xfersize; + DIP_ASSIGN(ip, size, ip->i_size); + uvm_vnp_setsize(vp, ip->i_size); + extended = 1; + } + size = blksize(fs, ip, lbn) - bp->b_resid; + if (xfersize > size) + xfersize = size; + + error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); + + /* + * if we didn't clear the block and the uiomove failed, + * the buf will now contain part of some other file, + * so we need to invalidate it. 
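+ * Releasing it with BC_INVAL below keeps the stale contents from being
+ * reused.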
+ */ + if (error && (flags & B_CLRBUF) == 0) { + brelse(bp, BC_INVAL); + break; + } +#ifdef LFS_READWRITE + (void)VOP_BWRITE(bp->b_vp, bp); + lfs_reserve(fs, vp, NULL, + -btofsb(fs, (NIADDR + 1) << fs->lfs_bshift)); + need_unreserve = false; +#else + if (ioflag & IO_SYNC) + (void)bwrite(bp); + else if (xfersize + blkoffset == fs->fs_bsize) + bawrite(bp); + else + bdwrite(bp); +#endif + if (error || xfersize == 0) + break; + } +#ifdef LFS_READWRITE + if (need_unreserve) { + lfs_reserve(fs, vp, NULL, + -btofsb(fs, (NIADDR + 1) << fs->lfs_bshift)); + } +#endif + + /* + * If we successfully wrote any data, and we are not the superuser + * we clear the setuid and setgid bits as a precaution against + * tampering. + */ +out: + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (vp->v_mount->mnt_flag & MNT_RELATIME) + ip->i_flag |= IN_ACCESS; + if (resid > uio->uio_resid && ap->a_cred && + kauth_authorize_generic(ap->a_cred, KAUTH_GENERIC_ISSUSER, NULL)) { + ip->i_mode &= ~(ISUID | ISGID); + DIP_ASSIGN(ip, mode, ip->i_mode); + } + if (resid > uio->uio_resid) + VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0)); + if (error) { + (void) UFS_TRUNCATE(vp, osize, ioflag & IO_SYNC, ap->a_cred); + uio->uio_offset -= resid - uio->uio_resid; + uio->uio_resid = resid; + } else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC) + error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT); + else + UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); + KASSERT(vp->v_size == ip->i_size); + if ((ioflag & IO_JOURNALLOCKED) == 0) + UFS_WAPBL_END(vp->v_mount); + fstrans_done(vp->v_mount); + + return (error); +} diff --git a/sys/ufs/ufs/ufs_vfsops.c b/sys/ufs/ufs/ufs_vfsops.c new file mode 100644 index 000000000..ac7230bca --- /dev/null +++ b/sys/ufs/ufs/ufs_vfsops.c @@ -0,0 +1,308 @@ +/* $NetBSD: ufs_vfsops.c,v 1.42 2011/03/24 17:05:46 bouyer Exp $ */ + +/* + * Copyright (c) 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_vfsops.c 8.8 (Berkeley) 5/20/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ufs_vfsops.c,v 1.42 2011/03/24 17:05:46 bouyer Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_ffs.h" +#include "opt_quota.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#ifdef UFS_DIRHASH +#include +#endif +#include + +/* how many times ufs_init() was called */ +static int ufs_initcount = 0; + +pool_cache_t ufs_direct_cache; + +/* + * Make a filesystem operational. + * Nothing to do at the moment. + */ +/* ARGSUSED */ +int +ufs_start(struct mount *mp, int flags) +{ + + return (0); +} + +/* + * Return the root of a filesystem. + */ +int +ufs_root(struct mount *mp, struct vnode **vpp) +{ + struct vnode *nvp; + int error; + + if ((error = VFS_VGET(mp, (ino_t)ROOTINO, &nvp)) != 0) + return (error); + *vpp = nvp; + return (0); +} + +/* + * Do operations associated with quotas + */ +int +ufs_quotactl(struct mount *mp, prop_dictionary_t dict) +{ + struct lwp *l = curlwp; + +#if !defined(QUOTA) && !defined(QUOTA2) + (void) mp; + (void) dict; + (void) l; + return (EOPNOTSUPP); +#else + int error; + prop_dictionary_t cmddict; + prop_array_t commands; + prop_object_iterator_t iter; + + /* Mark the mount busy, as we're passing it to kauth(9). */ + error = vfs_busy(mp, NULL); + if (error) + return (error); + + error = quota_get_cmds(dict, &commands); + if (error) + goto out_vfs; + iter = prop_array_iterator(commands); + if (iter == NULL) { + error = ENOMEM; + goto out_vfs; + } + + + mutex_enter(&mp->mnt_updating); + while ((cmddict = prop_object_iterator_next(iter)) != NULL) { + if (prop_object_type(cmddict) != PROP_TYPE_DICTIONARY) + continue; + error = quota_handle_cmd(mp, l, cmddict); + if (error) + break; + } + prop_object_iterator_release(iter); + mutex_exit(&mp->mnt_updating); +out_vfs: + vfs_unbusy(mp, false, NULL); + return (error); +#endif +} + +#if 0 + switch (cmd) { + case Q_SYNC: + break; + + case Q_GETQUOTA: + /* The user can always query about his own quota. */ + if (uid == kauth_cred_getuid(l->l_cred)) + break; + + error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, + KAUTH_REQ_SYSTEM_FS_QUOTA_GET, mp, KAUTH_ARG(uid), NULL); + + break; + + case Q_QUOTAON: + case Q_QUOTAOFF: + error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, + KAUTH_REQ_SYSTEM_FS_QUOTA_ONOFF, mp, NULL, NULL); + + break; + + case Q_SETQUOTA: + case Q_SETUSE: + error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA, + KAUTH_REQ_SYSTEM_FS_QUOTA_MANAGE, mp, KAUTH_ARG(uid), NULL); + + break; + + default: + error = EINVAL; + break; + } + + type = cmds & SUBCMDMASK; + if (!error) { + /* Only check if there was no error above. 
*/ + if ((u_int)type >= MAXQUOTAS) + error = EINVAL; + } + + if (error) { + vfs_unbusy(mp, false, NULL); + return (error); + } + + mutex_enter(&mp->mnt_updating); + switch (cmd) { + + case Q_QUOTAON: + error = quotaon(l, mp, type, arg); + break; + + case Q_QUOTAOFF: + error = quotaoff(l, mp, type); + break; + + case Q_SETQUOTA: + error = setquota(mp, uid, type, arg); + break; + + case Q_SETUSE: + error = setuse(mp, uid, type, arg); + break; + + case Q_GETQUOTA: + error = getquota(mp, uid, type, arg); + break; + + case Q_SYNC: + error = qsync(mp); + break; + + default: + error = EINVAL; + } + mutex_exit(&mp->mnt_updating); + vfs_unbusy(mp, false, NULL); + return (error); +#endif + +/* + * This is the generic part of fhtovp called after the underlying + * filesystem has validated the file handle. + */ +int +ufs_fhtovp(struct mount *mp, struct ufid *ufhp, struct vnode **vpp) +{ + struct vnode *nvp; + struct inode *ip; + int error; + + if ((error = VFS_VGET(mp, ufhp->ufid_ino, &nvp)) != 0) { + *vpp = NULLVP; + return (error); + } + ip = VTOI(nvp); + if (ip->i_mode == 0 || ip->i_gen != ufhp->ufid_gen) { + vput(nvp); + *vpp = NULLVP; + return (ESTALE); + } + *vpp = nvp; + return (0); +} + +/* + * Initialize UFS filesystems, done only once. + */ +void +ufs_init(void) +{ + if (ufs_initcount++ > 0) + return; + + ufs_direct_cache = pool_cache_init(sizeof(struct direct), 0, 0, 0, + "ufsdir", NULL, IPL_NONE, NULL, NULL, NULL); + + ufs_ihashinit(); +#if defined(QUOTA) || defined(QUOTA2) + dqinit(); +#endif +#ifdef UFS_DIRHASH + ufsdirhash_init(); +#endif +#ifdef UFS_EXTATTR + ufs_extattr_init(); +#endif +} + +void +ufs_reinit(void) +{ + ufs_ihashreinit(); +#if defined(QUOTA) || defined(QUOTA2) + dqreinit(); +#endif +} + +/* + * Free UFS filesystem resources, done only once. + */ +void +ufs_done(void) +{ + if (--ufs_initcount > 0) + return; + + ufs_ihashdone(); +#if defined(QUOTA) || defined(QUOTA2) + dqdone(); +#endif + pool_cache_destroy(ufs_direct_cache); +#ifdef UFS_DIRHASH + ufsdirhash_done(); +#endif +#ifdef UFS_EXTATTR + ufs_extattr_done(); +#endif +} diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c new file mode 100644 index 000000000..634f96694 --- /dev/null +++ b/sys/ufs/ufs/ufs_vnops.c @@ -0,0 +1,2989 @@ +/* $NetBSD: ufs_vnops.c,v 1.206 2011/11/18 21:18:52 christos Exp $ */ + +/*- + * Copyright (c) 2008 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 1982, 1986, 1989, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_vnops.c 8.28 (Berkeley) 7/31/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ufs_vnops.c,v 1.206 2011/11/18 21:18:52 christos Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_ffs.h" +#include "opt_quota.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#ifdef UFS_DIRHASH +#include +#endif +#include +#include +#include +#include +#include + +#include + +__CTASSERT(EXT2FS_MAXNAMLEN == FFS_MAXNAMLEN); +__CTASSERT(LFS_MAXNAMLEN == FFS_MAXNAMLEN); + +static int ufs_chmod(struct vnode *, int, kauth_cred_t, struct lwp *); +static int ufs_chown(struct vnode *, uid_t, gid_t, kauth_cred_t, + struct lwp *); + +/* + * A virgin directory (no blushing please). + */ +static const struct dirtemplate mastertemplate = { + 0, 12, DT_DIR, 1, ".", + 0, DIRBLKSIZ - 12, DT_DIR, 2, ".." 
+}; + +/* + * Create a regular file + */ +int +ufs_create(void *v) +{ + struct vop_create_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap = v; + int error; + struct vnode *dvp = ap->a_dvp; + struct ufs_lookup_results *ulr; + + /* XXX should handle this material another way */ + ulr = &VTOI(dvp)->i_crap; + UFS_CHECK_CRAPCOUNTER(VTOI(dvp)); + + /* + * UFS_WAPBL_BEGIN1(dvp->v_mount, dvp) performed by successful + * ufs_makeinode + */ + fstrans_start(dvp->v_mount, FSTRANS_SHARED); + error = + ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), + dvp, ulr, ap->a_vpp, ap->a_cnp); + if (error) { + fstrans_done(dvp->v_mount); + return (error); + } + UFS_WAPBL_END1(dvp->v_mount, dvp); + fstrans_done(dvp->v_mount); + VN_KNOTE(dvp, NOTE_WRITE); + return (0); +} + +/* + * Mknod vnode call + */ +/* ARGSUSED */ +int +ufs_mknod(void *v) +{ + struct vop_mknod_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap = v; + struct vattr *vap; + struct vnode **vpp; + struct inode *ip; + int error; + struct mount *mp; + ino_t ino; + struct ufs_lookup_results *ulr; + + vap = ap->a_vap; + vpp = ap->a_vpp; + + /* XXX should handle this material another way */ + ulr = &VTOI(ap->a_dvp)->i_crap; + UFS_CHECK_CRAPCOUNTER(VTOI(ap->a_dvp)); + + /* + * UFS_WAPBL_BEGIN1(dvp->v_mount, dvp) performed by successful + * ufs_makeinode + */ + fstrans_start(ap->a_dvp->v_mount, FSTRANS_SHARED); + if ((error = + ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), + ap->a_dvp, ulr, vpp, ap->a_cnp)) != 0) + goto out; + VN_KNOTE(ap->a_dvp, NOTE_WRITE); + ip = VTOI(*vpp); + mp = (*vpp)->v_mount; + ino = ip->i_number; + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + if (vap->va_rdev != VNOVAL) { + struct ufsmount *ump = ip->i_ump; + /* + * Want to be able to use this to make badblock + * inodes, so don't truncate the dev number. + */ + if (ump->um_fstype == UFS1) + ip->i_ffs1_rdev = ufs_rw32(vap->va_rdev, + UFS_MPNEEDSWAP(ump)); + else + ip->i_ffs2_rdev = ufs_rw64(vap->va_rdev, + UFS_MPNEEDSWAP(ump)); + } + UFS_WAPBL_UPDATE(*vpp, NULL, NULL, 0); + UFS_WAPBL_END1(ap->a_dvp->v_mount, ap->a_dvp); + /* + * Remove inode so that it will be reloaded by VFS_VGET and + * checked to see if it is an alias of an existing entry in + * the inode cache. + */ + (*vpp)->v_type = VNON; + VOP_UNLOCK(*vpp); + vgone(*vpp); + error = VFS_VGET(mp, ino, vpp); +out: + fstrans_done(ap->a_dvp->v_mount); + if (error != 0) { + *vpp = NULL; + return (error); + } + return (0); +} + +/* + * Open called. + * + * Nothing to do. + */ +/* ARGSUSED */ +int +ufs_open(void *v) +{ + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + kauth_cred_t a_cred; + } */ *ap = v; + + /* + * Files marked append-only must be opened for appending. + */ + if ((VTOI(ap->a_vp)->i_flags & APPEND) && + (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) + return (EPERM); + return (0); +} + +/* + * Close called. + * + * Update the times on the inode. 
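+ * The times are only refreshed here while other references to the vnode
+ * remain (v_usecount > 1).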
+ */ +/* ARGSUSED */ +int +ufs_close(void *v) +{ + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp; + struct inode *ip; + + vp = ap->a_vp; + ip = VTOI(vp); + fstrans_start(vp->v_mount, FSTRANS_SHARED); + if (vp->v_usecount > 1) + UFS_ITIMES(vp, NULL, NULL, NULL); + fstrans_done(vp->v_mount); + return (0); +} + +static int +ufs_check_possible(struct vnode *vp, struct inode *ip, mode_t mode, + kauth_cred_t cred) +{ +#if defined(QUOTA) || defined(QUOTA2) + int error; +#endif + + /* + * Disallow write attempts on read-only file systems; + * unless the file is a socket, fifo, or a block or + * character device resident on the file system. + */ + if (mode & VWRITE) { + switch (vp->v_type) { + case VDIR: + case VLNK: + case VREG: + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); +#if defined(QUOTA) || defined(QUOTA2) + fstrans_start(vp->v_mount, FSTRANS_SHARED); + error = chkdq(ip, 0, cred, 0); + fstrans_done(vp->v_mount); + if (error != 0) + return error; +#endif + break; + case VBAD: + case VBLK: + case VCHR: + case VSOCK: + case VFIFO: + case VNON: + default: + break; + } + } + + /* If it is a snapshot, nobody gets access to it. */ + if ((ip->i_flags & SF_SNAPSHOT)) + return (EPERM); + /* If immutable bit set, nobody gets to write it. */ + if ((mode & VWRITE) && (ip->i_flags & IMMUTABLE)) + return (EPERM); + + return 0; +} + +static int +ufs_check_permitted(struct vnode *vp, struct inode *ip, mode_t mode, + kauth_cred_t cred) +{ + + return genfs_can_access(vp->v_type, ip->i_mode & ALLPERMS, ip->i_uid, + ip->i_gid, mode, cred); +} + +int +ufs_access(void *v) +{ + struct vop_access_args /* { + struct vnode *a_vp; + int a_mode; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp; + struct inode *ip; + mode_t mode; + int error; + + vp = ap->a_vp; + ip = VTOI(vp); + mode = ap->a_mode; + + error = ufs_check_possible(vp, ip, mode, ap->a_cred); + if (error) + return error; + + error = ufs_check_permitted(vp, ip, mode, ap->a_cred); + + return error; +} + +/* ARGSUSED */ +int +ufs_getattr(void *v) +{ + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp; + struct inode *ip; + struct vattr *vap; + + vp = ap->a_vp; + ip = VTOI(vp); + vap = ap->a_vap; + fstrans_start(vp->v_mount, FSTRANS_SHARED); + UFS_ITIMES(vp, NULL, NULL, NULL); + + /* + * Copy from inode table + */ + vap->va_fsid = ip->i_dev; + vap->va_fileid = ip->i_number; + vap->va_mode = ip->i_mode & ALLPERMS; + vap->va_nlink = ip->i_nlink; + vap->va_uid = ip->i_uid; + vap->va_gid = ip->i_gid; + vap->va_size = vp->v_size; + if (ip->i_ump->um_fstype == UFS1) { + vap->va_rdev = (dev_t)ufs_rw32(ip->i_ffs1_rdev, + UFS_MPNEEDSWAP(ip->i_ump)); + vap->va_atime.tv_sec = ip->i_ffs1_atime; + vap->va_atime.tv_nsec = ip->i_ffs1_atimensec; + vap->va_mtime.tv_sec = ip->i_ffs1_mtime; + vap->va_mtime.tv_nsec = ip->i_ffs1_mtimensec; + vap->va_ctime.tv_sec = ip->i_ffs1_ctime; + vap->va_ctime.tv_nsec = ip->i_ffs1_ctimensec; + vap->va_birthtime.tv_sec = 0; + vap->va_birthtime.tv_nsec = 0; + vap->va_bytes = dbtob((u_quad_t)ip->i_ffs1_blocks); + } else { + vap->va_rdev = (dev_t)ufs_rw64(ip->i_ffs2_rdev, + UFS_MPNEEDSWAP(ip->i_ump)); + vap->va_atime.tv_sec = ip->i_ffs2_atime; + vap->va_atime.tv_nsec = ip->i_ffs2_atimensec; + vap->va_mtime.tv_sec = ip->i_ffs2_mtime; + vap->va_mtime.tv_nsec = ip->i_ffs2_mtimensec; + vap->va_ctime.tv_sec = ip->i_ffs2_ctime; + vap->va_ctime.tv_nsec = ip->i_ffs2_ctimensec; + 
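/* unlike UFS1, UFS2 carries a real creation (birth) time */ +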
vap->va_birthtime.tv_sec = ip->i_ffs2_birthtime; + vap->va_birthtime.tv_nsec = ip->i_ffs2_birthnsec; + vap->va_bytes = dbtob(ip->i_ffs2_blocks); + } + vap->va_gen = ip->i_gen; + vap->va_flags = ip->i_flags; + + /* this doesn't belong here */ + if (vp->v_type == VBLK) + vap->va_blocksize = BLKDEV_IOSIZE; + else if (vp->v_type == VCHR) + vap->va_blocksize = MAXBSIZE; + else + vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; + vap->va_type = vp->v_type; + vap->va_filerev = ip->i_modrev; + fstrans_done(vp->v_mount); + return (0); +} + +/* + * Set attribute vnode op. called from several syscalls + */ +int +ufs_setattr(void *v) +{ + struct vop_setattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + kauth_cred_t a_cred; + } */ *ap = v; + struct vattr *vap; + struct vnode *vp; + struct inode *ip; + kauth_cred_t cred; + struct lwp *l; + int error; + + vap = ap->a_vap; + vp = ap->a_vp; + ip = VTOI(vp); + cred = ap->a_cred; + l = curlwp; + + /* + * Check for unsettable attributes. + */ + if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || + (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || + (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || + ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { + return (EINVAL); + } + + fstrans_start(vp->v_mount, FSTRANS_SHARED); + + if (vap->va_flags != VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) { + error = EROFS; + goto out; + } + if (kauth_cred_geteuid(cred) != ip->i_uid && + (error = kauth_authorize_generic(cred, + KAUTH_GENERIC_ISSUSER, NULL))) + goto out; + if (kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, + NULL) == 0) { + if ((ip->i_flags & (SF_IMMUTABLE | SF_APPEND)) && + kauth_authorize_system(l->l_cred, + KAUTH_SYSTEM_CHSYSFLAGS, 0, NULL, NULL, NULL)) { + error = EPERM; + goto out; + } + /* Snapshot flag cannot be set or cleared */ + if ((vap->va_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) != + (ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL))) { + error = EPERM; + goto out; + } + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) + goto out; + ip->i_flags = vap->va_flags; + DIP_ASSIGN(ip, flags, ip->i_flags); + } else { + if ((ip->i_flags & (SF_IMMUTABLE | SF_APPEND)) || + (vap->va_flags & UF_SETTABLE) != vap->va_flags) { + error = EPERM; + goto out; + } + if ((ip->i_flags & SF_SETTABLE) != + (vap->va_flags & SF_SETTABLE)) { + error = EPERM; + goto out; + } + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) + goto out; + ip->i_flags &= SF_SETTABLE; + ip->i_flags |= (vap->va_flags & UF_SETTABLE); + DIP_ASSIGN(ip, flags, ip->i_flags); + } + ip->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); + UFS_WAPBL_END(vp->v_mount); + if (vap->va_flags & (IMMUTABLE | APPEND)) { + error = 0; + goto out; + } + } + if (ip->i_flags & (IMMUTABLE | APPEND)) { + error = EPERM; + goto out; + } + /* + * Go through the fields and update iff not VNOVAL. + */ + if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) { + error = EROFS; + goto out; + } + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) + goto out; + error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred, l); + UFS_WAPBL_END(vp->v_mount); + if (error) + goto out; + } + if (vap->va_size != VNOVAL) { + /* + * Disallow write attempts on read-only file systems; + * unless the file is a socket, fifo, or a block or + * character device resident on the file system. 
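+ * A size change on a directory is rejected with EISDIR, and vnode types
+ * other than regular files, devices and fifos get EOPNOTSUPP.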
+ */ + switch (vp->v_type) { + case VDIR: + error = EISDIR; + goto out; + case VCHR: + case VBLK: + case VFIFO: + break; + case VREG: + if (vp->v_mount->mnt_flag & MNT_RDONLY) { + error = EROFS; + goto out; + } + if ((ip->i_flags & SF_SNAPSHOT) != 0) { + error = EPERM; + goto out; + } + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) + goto out; + /* + * When journaling, only truncate one indirect block + * at a time. + */ + if (vp->v_mount->mnt_wapbl) { + uint64_t incr = MNINDIR(ip->i_ump) << + vp->v_mount->mnt_fs_bshift; /* Power of 2 */ + uint64_t base = NDADDR << + vp->v_mount->mnt_fs_bshift; + while (!error && ip->i_size > base + incr && + ip->i_size > vap->va_size + incr) { + /* + * round down to next full indirect + * block boundary. + */ + uint64_t nsize = base + + ((ip->i_size - base - 1) & + ~(incr - 1)); + error = UFS_TRUNCATE(vp, nsize, 0, + cred); + if (error == 0) { + UFS_WAPBL_END(vp->v_mount); + error = + UFS_WAPBL_BEGIN(vp->v_mount); + } + } + } + if (!error) + error = UFS_TRUNCATE(vp, vap->va_size, 0, cred); + UFS_WAPBL_END(vp->v_mount); + if (error) + goto out; + break; + default: + error = EOPNOTSUPP; + goto out; + } + } + ip = VTOI(vp); + if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || + vap->va_birthtime.tv_sec != VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) { + error = EROFS; + goto out; + } + if ((ip->i_flags & SF_SNAPSHOT) != 0) { + error = EPERM; + goto out; + } + error = genfs_can_chtimes(vp, vap->va_vaflags, ip->i_uid, cred); + if (error) + goto out; + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) + goto out; + if (vap->va_atime.tv_sec != VNOVAL) + if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) + ip->i_flag |= IN_ACCESS; + if (vap->va_mtime.tv_sec != VNOVAL) { + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (vp->v_mount->mnt_flag & MNT_RELATIME) + ip->i_flag |= IN_ACCESS; + } + if (vap->va_birthtime.tv_sec != VNOVAL && + ip->i_ump->um_fstype == UFS2) { + ip->i_ffs2_birthtime = vap->va_birthtime.tv_sec; + ip->i_ffs2_birthnsec = vap->va_birthtime.tv_nsec; + } + error = UFS_UPDATE(vp, &vap->va_atime, &vap->va_mtime, 0); + UFS_WAPBL_END(vp->v_mount); + if (error) + goto out; + } + error = 0; + if (vap->va_mode != (mode_t)VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) { + error = EROFS; + goto out; + } + if ((ip->i_flags & SF_SNAPSHOT) != 0 && + (vap->va_mode & (S_IXUSR | S_IWUSR | S_IXGRP | S_IWGRP | + S_IXOTH | S_IWOTH))) { + error = EPERM; + goto out; + } + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) + goto out; + error = ufs_chmod(vp, (int)vap->va_mode, cred, l); + UFS_WAPBL_END(vp->v_mount); + } + VN_KNOTE(vp, NOTE_ATTRIB); +out: + fstrans_done(vp->v_mount); + return (error); +} + +/* + * Change the mode on a file. + * Inode must be locked before calling. + */ +static int +ufs_chmod(struct vnode *vp, int mode, kauth_cred_t cred, struct lwp *l) +{ + struct inode *ip; + int error; + + UFS_WAPBL_JLOCK_ASSERT(vp->v_mount); + + ip = VTOI(vp); + + error = genfs_can_chmod(vp, cred, ip->i_uid, ip->i_gid, mode); + if (error) + return (error); + + fstrans_start(vp->v_mount, FSTRANS_SHARED); + ip->i_mode &= ~ALLPERMS; + ip->i_mode |= (mode & ALLPERMS); + ip->i_flag |= IN_CHANGE; + DIP_ASSIGN(ip, mode, ip->i_mode); + UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); + fstrans_done(vp->v_mount); + return (0); +} + +/* + * Perform chown operation on inode ip; + * inode must be locked prior to call. 
+ */ +static int +ufs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred, + struct lwp *l) +{ + struct inode *ip; + int error = 0; +#if defined(QUOTA) || defined(QUOTA2) + uid_t ouid; + gid_t ogid; + int64_t change; +#endif + ip = VTOI(vp); + error = 0; + + if (uid == (uid_t)VNOVAL) + uid = ip->i_uid; + if (gid == (gid_t)VNOVAL) + gid = ip->i_gid; + + error = genfs_can_chown(vp, cred, ip->i_uid, ip->i_gid, uid, gid); + if (error) + return (error); + + fstrans_start(vp->v_mount, FSTRANS_SHARED); +#if defined(QUOTA) || defined(QUOTA2) + ogid = ip->i_gid; + ouid = ip->i_uid; + change = DIP(ip, blocks); + (void) chkdq(ip, -change, cred, 0); + (void) chkiq(ip, -1, cred, 0); +#endif + ip->i_gid = gid; + DIP_ASSIGN(ip, gid, gid); + ip->i_uid = uid; + DIP_ASSIGN(ip, uid, uid); +#if defined(QUOTA) || defined(QUOTA2) + if ((error = chkdq(ip, change, cred, 0)) == 0) { + if ((error = chkiq(ip, 1, cred, 0)) == 0) + goto good; + else + (void) chkdq(ip, -change, cred, FORCE); + } + ip->i_gid = ogid; + DIP_ASSIGN(ip, gid, ogid); + ip->i_uid = ouid; + DIP_ASSIGN(ip, uid, ouid); + (void) chkdq(ip, change, cred, FORCE); + (void) chkiq(ip, 1, cred, FORCE); + fstrans_done(vp->v_mount); + return (error); + good: +#endif /* QUOTA || QUOTA2 */ + ip->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); + fstrans_done(vp->v_mount); + return (0); +} + +int +ufs_remove(void *v) +{ + struct vop_remove_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap = v; + struct vnode *vp, *dvp; + struct inode *ip; + int error; + struct ufs_lookup_results *ulr; + + vp = ap->a_vp; + dvp = ap->a_dvp; + ip = VTOI(vp); + + /* XXX should handle this material another way */ + ulr = &VTOI(dvp)->i_crap; + UFS_CHECK_CRAPCOUNTER(VTOI(dvp)); + + fstrans_start(dvp->v_mount, FSTRANS_SHARED); + if (vp->v_type == VDIR || (ip->i_flags & (IMMUTABLE | APPEND)) || + (VTOI(dvp)->i_flags & APPEND)) + error = EPERM; + else { + error = UFS_WAPBL_BEGIN(dvp->v_mount); + if (error == 0) { + error = ufs_dirremove(dvp, ulr, + ip, ap->a_cnp->cn_flags, 0); + UFS_WAPBL_END(dvp->v_mount); + } + } + VN_KNOTE(vp, NOTE_DELETE); + VN_KNOTE(dvp, NOTE_WRITE); + if (dvp == vp) + vrele(vp); + else + vput(vp); + vput(dvp); + fstrans_done(dvp->v_mount); + return (error); +} + +/* + * ufs_link: create hard link. 
+ */ +int +ufs_link(void *v) +{ + struct vop_link_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap = v; + struct vnode *dvp = ap->a_dvp; + struct vnode *vp = ap->a_vp; + struct componentname *cnp = ap->a_cnp; + struct inode *ip; + struct direct *newdir; + int error; + struct ufs_lookup_results *ulr; + + KASSERT(dvp != vp); + KASSERT(vp->v_type != VDIR); + KASSERT(dvp->v_mount == vp->v_mount); + + /* XXX should handle this material another way */ + ulr = &VTOI(dvp)->i_crap; + UFS_CHECK_CRAPCOUNTER(VTOI(dvp)); + + fstrans_start(dvp->v_mount, FSTRANS_SHARED); + error = vn_lock(vp, LK_EXCLUSIVE); + if (error) { + VOP_ABORTOP(dvp, cnp); + goto out2; + } + ip = VTOI(vp); + if ((nlink_t)ip->i_nlink >= LINK_MAX) { + VOP_ABORTOP(dvp, cnp); + error = EMLINK; + goto out1; + } + if (ip->i_flags & (IMMUTABLE | APPEND)) { + VOP_ABORTOP(dvp, cnp); + error = EPERM; + goto out1; + } + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) { + VOP_ABORTOP(dvp, cnp); + goto out1; + } + ip->i_nlink++; + DIP_ASSIGN(ip, nlink, ip->i_nlink); + ip->i_flag |= IN_CHANGE; + error = UFS_UPDATE(vp, NULL, NULL, UPDATE_DIROP); + if (!error) { + newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK); + ufs_makedirentry(ip, cnp, newdir); + error = ufs_direnter(dvp, ulr, vp, newdir, cnp, NULL); + pool_cache_put(ufs_direct_cache, newdir); + } + if (error) { + ip->i_nlink--; + DIP_ASSIGN(ip, nlink, ip->i_nlink); + ip->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(vp, NULL, NULL, UPDATE_DIROP); + } + UFS_WAPBL_END(vp->v_mount); + out1: + VOP_UNLOCK(vp); + out2: + VN_KNOTE(vp, NOTE_LINK); + VN_KNOTE(dvp, NOTE_WRITE); + vput(dvp); + fstrans_done(dvp->v_mount); + return (error); +} + +/* + * whiteout vnode call + */ +int +ufs_whiteout(void *v) +{ + struct vop_whiteout_args /* { + struct vnode *a_dvp; + struct componentname *a_cnp; + int a_flags; + } */ *ap = v; + struct vnode *dvp = ap->a_dvp; + struct componentname *cnp = ap->a_cnp; + struct direct *newdir; + int error; + struct ufsmount *ump = VFSTOUFS(dvp->v_mount); + struct ufs_lookup_results *ulr; + + /* XXX should handle this material another way */ + ulr = &VTOI(dvp)->i_crap; + UFS_CHECK_CRAPCOUNTER(VTOI(dvp)); + + error = 0; + switch (ap->a_flags) { + case LOOKUP: + /* 4.4 format directories support whiteout operations */ + if (ump->um_maxsymlinklen > 0) + return (0); + return (EOPNOTSUPP); + + case CREATE: + /* create a new directory whiteout */ + fstrans_start(dvp->v_mount, FSTRANS_SHARED); + error = UFS_WAPBL_BEGIN(dvp->v_mount); + if (error) + break; +#ifdef DIAGNOSTIC + if (ump->um_maxsymlinklen <= 0) + panic("ufs_whiteout: old format filesystem"); +#endif + + newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK); + newdir->d_ino = WINO; + newdir->d_namlen = cnp->cn_namelen; + memcpy(newdir->d_name, cnp->cn_nameptr, + (size_t)cnp->cn_namelen); + newdir->d_name[cnp->cn_namelen] = '\0'; + newdir->d_type = DT_WHT; + error = ufs_direnter(dvp, ulr, NULL, newdir, cnp, NULL); + pool_cache_put(ufs_direct_cache, newdir); + break; + + case DELETE: + /* remove an existing directory whiteout */ + fstrans_start(dvp->v_mount, FSTRANS_SHARED); + error = UFS_WAPBL_BEGIN(dvp->v_mount); + if (error) + break; +#ifdef DIAGNOSTIC + if (ump->um_maxsymlinklen <= 0) + panic("ufs_whiteout: old format filesystem"); +#endif + + cnp->cn_flags &= ~DOWHITEOUT; + error = ufs_dirremove(dvp, ulr, NULL, cnp->cn_flags, 0); + break; + default: + panic("ufs_whiteout: unknown op"); + /* NOTREACHED */ + } + UFS_WAPBL_END(dvp->v_mount); + fstrans_done(dvp->v_mount); + 
return (error); +} + + +/* + * Rename vnode operation + * rename("foo", "bar"); + * is essentially + * unlink("bar"); + * link("foo", "bar"); + * unlink("foo"); + * but ``atomically''. Can't do full commit without saving state in the + * inode on disk which isn't feasible at this time. Best we can do is + * always guarantee the target exists. + * + * Basic algorithm is: + * + * 1) Bump link count on source while we're linking it to the + * target. This also ensure the inode won't be deleted out + * from underneath us while we work (it may be truncated by + * a concurrent `trunc' or `open' for creation). + * 2) Link source to destination. If destination already exists, + * delete it first. + * 3) Unlink source reference to inode if still around. If a + * directory was moved and the parent of the destination + * is different from the source, patch the ".." entry in the + * directory. + */ + +/* + * Notes on rename locking: + * + * We lock parent vnodes before child vnodes. This means in particular + * that if A is above B in the directory tree then A must be locked + * before B. (This is true regardless of how many steps appear in + * between, because an arbitrary number of other processes could lock + * parent/child in between and establish a lock cycle and deadlock.) + * + * Therefore, if tdvp is above fdvp we must lock tdvp first; if fdvp + * is above tdvp we must lock fdvp first; and if they're + * incommensurate it doesn't matter. (But, we rely on the fact that + * there's a whole-volume rename lock to prevent deadlock among groups + * of renames upon overlapping sets of incommensurate vnodes.) + * + * In addition to establishing lock ordering the parent check also + * serves to rule out cases where someone tries to move a directory + * underneath itself, e.g. rename("a/b", "a/b/c"). If allowed to + * proceed such renames would detach portions of the directory tree + * and make fsck very unhappy. + * + * Note that it is an error for *fvp* to be above tdvp; however, + * *fdvp* can be above tdvp, as in rename("a/b", "a/c/d"). + * + * The parent check searches up the tree from tdvp until it either + * finds fdvp or the root of the volume. It also returns the vnode it + * saw immediately before fdvp, if any. Later on (after looking up + * fvp) we will check to see if this *is* fvp and if so fail. + * + * If the parent check finds fdvp, it means fdvp is above tdvp, so we + * lock fdvp first and then tdvp. Otherwise, either tdvp is above fdvp + * or they're incommensurate and we lock tdvp first. + * + * In either case each of the child vnodes has to be looked up and + * locked immediately after its parent. The cases + * + * fdvp/fvp/[.../]tdvp/tvp + * tdvp/tvp/[.../]fdvp/fvp + * + * can cause deadlock otherwise. Note that both of these are error + * cases; the first fails the parent check and the second fails + * because tvp isn't empty. The parent check case is handled before + * we start locking; however, the nonempty case requires locking tvp + * to find out safely that it's nonempty. 
+ * + * Therefore the procedure is either + * + * lock fdvp + * lookup fvp + * lock fvp + * lock tdvp + * lookup tvp + * lock tvp + * + * or + * + * lock tdvp + * lookup tvp + * lock tvp + * lock fdvp + * lookup fvp + * lock fvp + * + * This could in principle be simplified by always looking up fvp + * last; because of the parent check we know by the time we start + * locking that fvp cannot be directly above tdvp, so (given the + * whole-volume rename lock and other assumptions) it's safe to lock + * tdvp before fvp. This would allow the following scheme: + * + * lock fdvp + * lock tdvp + * or + * lock tdvp + * lock fdvp + * + * then + * lookup tvp + * lock tvp + * lookup fvp + * check if fvp is above of tdvp, fail if so + * lock fvp + * + * which is much, much simpler. + * + * However, current levels of vfs namei/lookup sanity do not permit + * this. It is impossible currently to look up fvp without locking it. + * (It gets locked regardless of whether LOCKLEAF is set; without + * LOCKLEAF it just gets unlocked again, which doesn't help.) + * + * Therefore, because we must look up fvp to know if it's above tdvp, + * which locks fvp, we must, at least in the case where fdvp is above + * tdvp, do that before locking tdvp. The longer scheme does that; the + * simpler scheme is not safe. + * + * Note that for now we aren't doing lookup() but relookup(); however, + * the differences are minor. + * + * On top of all the above, just to make everything more + * exciting, any two of the vnodes might end up being the same. + * + * FROMPARENT == FROMCHILD mv a/. foo is an error. + * FROMPARENT == TOPARENT mv a/b a/c is ok. + * FROMPARENT == TOCHILD mv a/b/c a/b will give ENOTEMPTY. + * FROMCHILD == TOPARENT mv a/b a/b/c fails the parent check. + * FROMCHILD == TOCHILD mv a/b a/b is ok. + * TOPARENT == TOCHILD mv foo a/. is an error. + * + * This introduces more cases in the locking, because each distinct + * vnode must be locked exactly once. + * + * When FROMPARENT == TOPARENT and FROMCHILD != TOCHILD we assume it + * doesn't matter what order the children are locked in, because the + * per-volume rename lock excludes other renames and no other + * operation locks two files in the same directory at once. (Note: if + * it turns out that link() does, link() is wrong.) + * + * Until such time as we can do lookups without the namei and lookup + * machinery "helpfully" locking the result vnode for us, we can't + * avoid tripping on cases where FROMCHILD == TOCHILD. Currently for + * non-directories we unlock the first one we lock while looking up + * the second, then relock it if necessary. This is more or less + * harmless since not much of interest can happen to the objects in + * that window while we have the containing directory locked; but it's + * not desirable and should be cleaned up when that becomes possible. + * The right way to do it is to check after looking the second one up + * and only lock it if it's different. (Note: for directories we don't + * do this dance because the same directory can't appear more than + * once.) + */ + +/* XXX following lifted from ufs_lookup.c */ +#define FSFMT(vp) (((vp)->v_mount->mnt_iflag & IMNT_DTYPE) == 0) + +/* + * Check if either entry referred to by FROM_ULR is within the range + * of entries named by TO_ULR. 
+ */ +static int +ulr_overlap(const struct ufs_lookup_results *from_ulr, + const struct ufs_lookup_results *to_ulr) +{ + doff_t from_start, from_prevstart; + doff_t to_start, to_end; + + /* + * FROM is a DELETE result; offset points to the entry to + * remove and subtracting count gives the previous entry. + */ + from_start = from_ulr->ulr_offset - from_ulr->ulr_count; + from_prevstart = from_ulr->ulr_offset; + + /* + * TO is a RENAME (thus non-DELETE) result; offset points + * to the beginning of a region to write in, and adding + * count gives the end of the region. + */ + to_start = to_ulr->ulr_offset; + to_end = to_ulr->ulr_offset + to_ulr->ulr_count; + + if (from_prevstart >= to_start && from_prevstart < to_end) { + return 1; + } + if (from_start >= to_start && from_start < to_end) { + return 1; + } + return 0; +} + +/* + * Wrapper for relookup that also updates the supplemental results. + */ +static int +do_relookup(struct vnode *dvp, struct ufs_lookup_results *ulr, + struct vnode **vp, struct componentname *cnp) +{ + int error; + + error = relookup(dvp, vp, cnp, 0); + if (error) { + return error; + } + /* update the supplemental reasults */ + *ulr = VTOI(dvp)->i_crap; + UFS_CHECK_CRAPCOUNTER(VTOI(dvp)); + return 0; +} + +/* + * Lock and relookup a sequence of two directories and two children. + * + */ +static int +lock_vnode_sequence(struct vnode *d1, struct ufs_lookup_results *ulr1, + struct vnode **v1_ret, struct componentname *cn1, + int v1_missing_ok, + int overlap_error, + struct vnode *d2, struct ufs_lookup_results *ulr2, + struct vnode **v2_ret, struct componentname *cn2, + int v2_missing_ok) +{ + struct vnode *v1, *v2; + int error; + + KASSERT(d1 != d2); + + vn_lock(d1, LK_EXCLUSIVE | LK_RETRY); + if (VTOI(d1)->i_size == 0) { + /* d1 has been rmdir'd */ + VOP_UNLOCK(d1); + return ENOENT; + } + error = do_relookup(d1, ulr1, &v1, cn1); + if (v1_missing_ok) { + if (error == ENOENT) { + /* + * Note: currently if the name doesn't exist, + * relookup succeeds (it intercepts the + * EJUSTRETURN from VOP_LOOKUP) and sets tvp + * to NULL. Therefore, we will never get + * ENOENT and this branch is not needed. + * However, in a saner future the EJUSTRETURN + * garbage will go away, so let's DTRT. + */ + v1 = NULL; + error = 0; + } + } else { + if (error == 0 && v1 == NULL) { + /* This is what relookup sets if v1 disappeared. */ + error = ENOENT; + } + } + if (error) { + VOP_UNLOCK(d1); + return error; + } + if (v1 && v1 == d2) { + VOP_UNLOCK(d1); + VOP_UNLOCK(v1); + vrele(v1); + return overlap_error; + } + + /* + * The right way to do this is to do lookups without locking + * the results, and lock the results afterwards; then at the + * end we can avoid trying to lock v2 if v2 == v1. + * + * However, for the reasons described in the fdvp == tdvp case + * in rename below, we can't do that safely. So, in the case + * where v1 is not a directory, unlock it and lock it again + * afterwards. This is safe in locking order because a + * non-directory can't be above anything else in the tree. If + * v1 *is* a directory, that's not true, but then because d1 + * != d2, v1 != v2. 
+ */ + if (v1 && v1->v_type != VDIR) { + VOP_UNLOCK(v1); + } + vn_lock(d2, LK_EXCLUSIVE | LK_RETRY); + if (VTOI(d2)->i_size == 0) { + /* d2 has been rmdir'd */ + VOP_UNLOCK(d2); + if (v1 && v1->v_type == VDIR) { + VOP_UNLOCK(v1); + } + VOP_UNLOCK(d1); + if (v1) { + vrele(v1); + } + return ENOENT; + } + error = do_relookup(d2, ulr2, &v2, cn2); + if (v2_missing_ok) { + if (error == ENOENT) { + /* as above */ + v2 = NULL; + error = 0; + } + } else { + if (error == 0 && v2 == NULL) { + /* This is what relookup sets if v2 disappeared. */ + error = ENOENT; + } + } + if (error) { + VOP_UNLOCK(d2); + if (v1 && v1->v_type == VDIR) { + VOP_UNLOCK(v1); + } + VOP_UNLOCK(d1); + if (v1) { + vrele(v1); + } + return error; + } + if (v1 && v1->v_type != VDIR && v1 != v2) { + vn_lock(v1, LK_EXCLUSIVE | LK_RETRY); + } + *v1_ret = v1; + *v2_ret = v2; + return 0; +} + +/* + * Rename vnode operation + * rename("foo", "bar"); + * is essentially + * unlink("bar"); + * link("foo", "bar"); + * unlink("foo"); + * but ``atomically''. Can't do full commit without saving state in the + * inode on disk which isn't feasible at this time. Best we can do is + * always guarantee the target exists. + * + * Basic algorithm is: + * + * 1) Bump link count on source while we're linking it to the + * target. This also ensure the inode won't be deleted out + * from underneath us while we work (it may be truncated by + * a concurrent `trunc' or `open' for creation). + * 2) Link source to destination. If destination already exists, + * delete it first. + * 3) Unlink source reference to inode if still around. If a + * directory was moved and the parent of the destination + * is different from the source, patch the ".." entry in the + * directory. + */ +int +ufs_rename(void *v) +{ + struct vop_rename_args /* { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; + } */ *ap = v; + struct vnode *tvp, *tdvp, *fvp, *fdvp; + struct componentname *tcnp, *fcnp; + struct inode *ip, *txp, *fxp, *tdp, *fdp; + struct mount *mp; + struct direct *newdir; + int doingdirectory, error; + ino_t oldparent, newparent; + + struct ufs_lookup_results from_ulr, to_ulr; + + tvp = ap->a_tvp; + tdvp = ap->a_tdvp; + fvp = ap->a_fvp; + fdvp = ap->a_fdvp; + tcnp = ap->a_tcnp; + fcnp = ap->a_fcnp; + doingdirectory = error = 0; + oldparent = newparent = 0; + + /* save the supplemental lookup results as they currently exist */ + from_ulr = VTOI(fdvp)->i_crap; + to_ulr = VTOI(tdvp)->i_crap; + UFS_CHECK_CRAPCOUNTER(VTOI(fdvp)); + UFS_CHECK_CRAPCOUNTER(VTOI(tdvp)); + + /* + * Owing to VFS oddities we are currently called with tdvp/tvp + * locked and not fdvp/fvp. In a sane world we'd be passed + * tdvp and fdvp only, unlocked, and two name strings. Pretend + * we have a sane world and unlock tdvp and tvp. + */ + VOP_UNLOCK(tdvp); + if (tvp && tvp != tdvp) { + VOP_UNLOCK(tvp); + } + + /* Also pretend we have a sane world and vrele fvp/tvp. */ + vrele(fvp); + fvp = NULL; + if (tvp) { + vrele(tvp); + tvp = NULL; + } + + /* + * Check for cross-device rename. + */ + if (fdvp->v_mount != tdvp->v_mount) { + error = EXDEV; + goto abort; + } + + /* + * Reject "." and ".." + */ + if ((fcnp->cn_flags & ISDOTDOT) || (tcnp->cn_flags & ISDOTDOT) || + (fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || + (tcnp->cn_namelen == 1 && tcnp->cn_nameptr[0] == '.')) { + error = EINVAL; + goto abort; + } + + /* + * Get locks. 
+ */ + + /* paranoia */ + fcnp->cn_flags |= LOCKPARENT|LOCKLEAF; + tcnp->cn_flags |= LOCKPARENT|LOCKLEAF; + + if (fdvp == tdvp) { + /* One directory. Lock it and relookup both children. */ + vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY); + + if (VTOI(fdvp)->i_size == 0) { + /* directory has been rmdir'd */ + VOP_UNLOCK(fdvp); + error = ENOENT; + goto abort; + } + + error = do_relookup(fdvp, &from_ulr, &fvp, fcnp); + if (error == 0 && fvp == NULL) { + /* relookup may produce this if fvp disappears */ + error = ENOENT; + } + if (error) { + VOP_UNLOCK(fdvp); + goto abort; + } + + /* + * The right way to do this is to look up both children + * without locking either, and then lock both unless they + * turn out to be the same. However, due to deep-seated + * VFS-level issues all lookups lock the child regardless + * of whether LOCKLEAF is set (if LOCKLEAF is not set, + * the child is locked during lookup and then unlocked) + * so it is not safe to look up tvp while fvp is locked. + * + * Unlocking fvp here temporarily is more or less safe, + * because with the directory locked there's not much + * that can happen to it. However, ideally it wouldn't + * be necessary. XXX. + */ + VOP_UNLOCK(fvp); + /* remember fdvp == tdvp so tdvp is locked */ + error = do_relookup(tdvp, &to_ulr, &tvp, tcnp); + if (error && error != ENOENT) { + VOP_UNLOCK(fdvp); + goto abort; + } + if (error == ENOENT) { + /* + * Note: currently if the name doesn't exist, + * relookup succeeds (it intercepts the + * EJUSTRETURN from VOP_LOOKUP) and sets tvp + * to NULL. Therefore, we will never get + * ENOENT and this branch is not needed. + * However, in a saner future the EJUSTRETURN + * garbage will go away, so let's DTRT. + */ + tvp = NULL; + } + + /* tvp is locked; lock fvp if necessary */ + if (!tvp || tvp != fvp) { + vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY); + } + } else { + int found_fdvp; + struct vnode *illegal_fvp; + + /* + * The source must not be above the destination. (If + * it were, the rename would detach a section of the + * tree.) + * + * Look up the tree from tdvp to see if we find fdvp, + * and if so, return the immediate child of fdvp we're + * under; that must not turn out to be the same as + * fvp. + * + * The per-volume rename lock guarantees that the + * result of this check remains true until we finish + * looking up and locking. + */ + error = ufs_parentcheck(fdvp, tdvp, fcnp->cn_cred, + &found_fdvp, &illegal_fvp); + if (error) { + goto abort; + } + + /* Must lock in tree order. */ + + if (found_fdvp) { + /* fdvp -> fvp -> tdvp -> tvp */ + error = lock_vnode_sequence(fdvp, &from_ulr, + &fvp, fcnp, 0, + EINVAL, + tdvp, &to_ulr, + &tvp, tcnp, 1); + } else { + /* tdvp -> tvp -> fdvp -> fvp */ + error = lock_vnode_sequence(tdvp, &to_ulr, + &tvp, tcnp, 1, + ENOTEMPTY, + fdvp, &from_ulr, + &fvp, fcnp, 0); + } + if (error) { + if (illegal_fvp) { + vrele(illegal_fvp); + } + goto abort; + } + KASSERT(fvp != NULL); + + if (illegal_fvp && fvp == illegal_fvp) { + vrele(illegal_fvp); + error = EINVAL; + goto abort_withlocks; + } + + if (illegal_fvp) { + vrele(illegal_fvp); + } + } + + KASSERT(fdvp && VOP_ISLOCKED(fdvp)); + KASSERT(fvp && VOP_ISLOCKED(fvp)); + KASSERT(tdvp && VOP_ISLOCKED(tdvp)); + KASSERT(tvp == NULL || VOP_ISLOCKED(tvp)); + + /* --- everything is now locked --- */ + + if (tvp && ((VTOI(tvp)->i_flags & (IMMUTABLE | APPEND)) || + (VTOI(tdvp)->i_flags & APPEND))) { + error = EPERM; + goto abort_withlocks; + } + + /* + * Check if just deleting a link name. 
+ */ + if (fvp == tvp) { + if (fvp->v_type == VDIR) { + error = EINVAL; + goto abort_withlocks; + } + + /* Release destination completely. Leave fdvp locked. */ + VOP_ABORTOP(tdvp, tcnp); + if (fdvp != tdvp) { + VOP_UNLOCK(tdvp); + } + VOP_UNLOCK(tvp); + vrele(tdvp); + vrele(tvp); + + /* Delete source. */ + /* XXX: do we really need to relookup again? */ + + /* + * fdvp is still locked, but we just unlocked fvp + * (because fvp == tvp) so just decref fvp + */ + vrele(fvp); + fcnp->cn_flags &= ~(MODMASK); + fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; + fcnp->cn_nameiop = DELETE; + if ((error = relookup(fdvp, &fvp, fcnp, 0))) { + vput(fdvp); + return (error); + } + return (VOP_REMOVE(fdvp, fvp, fcnp)); + } + fdp = VTOI(fdvp); + ip = VTOI(fvp); + if ((nlink_t) ip->i_nlink >= LINK_MAX) { + error = EMLINK; + goto abort_withlocks; + } + if ((ip->i_flags & (IMMUTABLE | APPEND)) || + (fdp->i_flags & APPEND)) { + error = EPERM; + goto abort_withlocks; + } + if ((ip->i_mode & IFMT) == IFDIR) { + /* + * Avoid ".", "..", and aliases of "." for obvious reasons. + */ + if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || + fdp == ip || + (fcnp->cn_flags & ISDOTDOT) || + (tcnp->cn_flags & ISDOTDOT) || + (ip->i_flag & IN_RENAME)) { + error = EINVAL; + goto abort_withlocks; + } + ip->i_flag |= IN_RENAME; + doingdirectory = 1; + } + oldparent = fdp->i_number; + VN_KNOTE(fdvp, NOTE_WRITE); /* XXXLUKEM/XXX: right place? */ + + /* + * Both the directory + * and target vnodes are locked. + */ + tdp = VTOI(tdvp); + txp = NULL; + if (tvp) + txp = VTOI(tvp); + + mp = fdvp->v_mount; + fstrans_start(mp, FSTRANS_SHARED); + + if (oldparent != tdp->i_number) + newparent = tdp->i_number; + + /* + * If ".." must be changed (ie the directory gets a new + * parent) the user must have write permission in the source + * so as to be able to change "..". + */ + if (doingdirectory && newparent) { + error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred); + if (error) + goto out; + } + + KASSERT(fdvp != tvp); + + if (newparent) { + /* Check for the rename("foo/foo", "foo") case. */ + if (fdvp == tvp) { + error = doingdirectory ? ENOTEMPTY : EISDIR; + goto out; + } + } + + fxp = VTOI(fvp); + fdp = VTOI(fdvp); + + error = UFS_WAPBL_BEGIN(fdvp->v_mount); + if (error) + goto out2; + + /* + * 1) Bump link count while we're moving stuff + * around. If we crash somewhere before + * completing our work, the link count + * may be wrong, but correctable. + */ + ip->i_nlink++; + DIP_ASSIGN(ip, nlink, ip->i_nlink); + ip->i_flag |= IN_CHANGE; + if ((error = UFS_UPDATE(fvp, NULL, NULL, UPDATE_DIROP)) != 0) { + goto bad; + } + + /* + * 2) If target doesn't exist, link the target + * to the source and unlink the source. + * Otherwise, rewrite the target directory + * entry to reference the source inode and + * expunge the original entry's existence. + */ + if (txp == NULL) { + if (tdp->i_dev != ip->i_dev) + panic("rename: EXDEV"); + /* + * Account for ".." in new directory. + * When source and destination have the same + * parent we don't fool with the link count. 
+ */ + if (doingdirectory && newparent) { + if ((nlink_t)tdp->i_nlink >= LINK_MAX) { + error = EMLINK; + goto bad; + } + tdp->i_nlink++; + DIP_ASSIGN(tdp, nlink, tdp->i_nlink); + tdp->i_flag |= IN_CHANGE; + if ((error = UFS_UPDATE(tdvp, NULL, NULL, + UPDATE_DIROP)) != 0) { + tdp->i_nlink--; + DIP_ASSIGN(tdp, nlink, tdp->i_nlink); + tdp->i_flag |= IN_CHANGE; + goto bad; + } + } + newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK); + ufs_makedirentry(ip, tcnp, newdir); + error = ufs_direnter(tdvp, &to_ulr, + NULL, newdir, tcnp, NULL); + pool_cache_put(ufs_direct_cache, newdir); + if (error != 0) { + if (doingdirectory && newparent) { + tdp->i_nlink--; + DIP_ASSIGN(tdp, nlink, tdp->i_nlink); + tdp->i_flag |= IN_CHANGE; + (void)UFS_UPDATE(tdvp, NULL, NULL, + UPDATE_WAIT | UPDATE_DIROP); + } + goto bad; + } + VN_KNOTE(tdvp, NOTE_WRITE); + } else { + if (txp->i_dev != tdp->i_dev || txp->i_dev != ip->i_dev) + panic("rename: EXDEV"); + /* + * Short circuit rename(foo, foo). + */ + if (txp->i_number == ip->i_number) + panic("rename: same file"); + /* + * If the parent directory is "sticky", then the user must + * own the parent directory, or the destination of the rename, + * otherwise the destination may not be changed (except by + * root). This implements append-only directories. + */ + if ((tdp->i_mode & S_ISTXT) && + kauth_authorize_generic(tcnp->cn_cred, + KAUTH_GENERIC_ISSUSER, NULL) != 0 && + kauth_cred_geteuid(tcnp->cn_cred) != tdp->i_uid && + txp->i_uid != kauth_cred_geteuid(tcnp->cn_cred)) { + error = EPERM; + goto bad; + } + /* + * Target must be empty if a directory and have no links + * to it. Also, ensure source and target are compatible + * (both directories, or both not directories). + */ + if ((txp->i_mode & IFMT) == IFDIR) { + if (txp->i_nlink > 2 || + !ufs_dirempty(txp, tdp->i_number, tcnp->cn_cred)) { + error = ENOTEMPTY; + goto bad; + } + if (!doingdirectory) { + error = ENOTDIR; + goto bad; + } + cache_purge(tdvp); + } else if (doingdirectory) { + error = EISDIR; + goto bad; + } + if ((error = ufs_dirrewrite(tdp, to_ulr.ulr_offset, + txp, ip->i_number, + IFTODT(ip->i_mode), doingdirectory && newparent ? + newparent : doingdirectory, IN_CHANGE | IN_UPDATE)) != 0) + goto bad; + if (doingdirectory) { + /* + * Truncate inode. The only stuff left in the directory + * is "." and "..". The "." reference is inconsequential + * since we are quashing it. We have removed the "." + * reference and the reference in the parent directory, + * but there may be other hard links. + */ + if (!newparent) { + tdp->i_nlink--; + DIP_ASSIGN(tdp, nlink, tdp->i_nlink); + tdp->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(tdvp, NULL, NULL, 0); + } + txp->i_nlink--; + DIP_ASSIGN(txp, nlink, txp->i_nlink); + txp->i_flag |= IN_CHANGE; + if ((error = UFS_TRUNCATE(tvp, (off_t)0, IO_SYNC, + tcnp->cn_cred))) + goto bad; + } + VN_KNOTE(tdvp, NOTE_WRITE); + VN_KNOTE(tvp, NOTE_DELETE); + } + + /* + * Handle case where the directory entry we need to remove, + * which is/was at from_ulr.ulr_offset, or the one before it, + * which is/was at from_ulr.ulr_offset - from_ulr.ulr_count, + * may have been moved when the directory insertion above + * performed compaction. 
+ */ + if (tdp->i_number == fdp->i_number && + ulr_overlap(&from_ulr, &to_ulr)) { + + struct buf *bp; + struct direct *ep; + struct ufsmount *ump = fdp->i_ump; + doff_t curpos; + doff_t endsearch; /* offset to end directory search */ + uint32_t prev_reclen; + int dirblksiz = ump->um_dirblksiz; + const int needswap = UFS_MPNEEDSWAP(ump); + u_long bmask; + int namlen, entryoffsetinblock; + char *dirbuf; + + bmask = fdvp->v_mount->mnt_stat.f_iosize - 1; + + /* + * The fcnp entry will be somewhere between the start of + * compaction (to_ulr.ulr_offset) and the original location + * (from_ulr.ulr_offset). + */ + curpos = to_ulr.ulr_offset; + endsearch = from_ulr.ulr_offset + from_ulr.ulr_reclen; + entryoffsetinblock = 0; + + /* + * Get the directory block containing the start of + * compaction. + */ + error = ufs_blkatoff(fdvp, (off_t)to_ulr.ulr_offset, &dirbuf, + &bp, false); + if (error) + goto bad; + + /* + * Keep existing ulr_count (length of previous record) + * for the case where compaction did not include the + * previous entry but started at the from-entry. + */ + prev_reclen = from_ulr.ulr_count; + + while (curpos < endsearch) { + uint32_t reclen; + + /* + * If necessary, get the next directory block. + * + * dholland 7/13/11 to the best of my understanding + * this should never happen; compaction occurs only + * within single blocks. I think. + */ + if ((curpos & bmask) == 0) { + if (bp != NULL) + brelse(bp, 0); + error = ufs_blkatoff(fdvp, (off_t)curpos, + &dirbuf, &bp, false); + if (error) + goto bad; + entryoffsetinblock = 0; + } + + KASSERT(bp != NULL); + ep = (struct direct *)(dirbuf + entryoffsetinblock); + reclen = ufs_rw16(ep->d_reclen, needswap); + +#if (BYTE_ORDER == LITTLE_ENDIAN) + if (FSFMT(fdvp) && needswap == 0) + namlen = ep->d_type; + else + namlen = ep->d_namlen; +#else + if (FSFMT(fdvp) && needswap != 0) + namlen = ep->d_type; + else + namlen = ep->d_namlen; +#endif + if ((ep->d_ino != 0) && + (ufs_rw32(ep->d_ino, needswap) != WINO) && + (namlen == fcnp->cn_namelen) && + memcmp(ep->d_name, fcnp->cn_nameptr, namlen) == 0) { + from_ulr.ulr_reclen = reclen; + break; + } + curpos += reclen; + entryoffsetinblock += reclen; + prev_reclen = reclen; + } + + from_ulr.ulr_offset = curpos; + from_ulr.ulr_count = prev_reclen; + + KASSERT(curpos <= endsearch); + + /* + * If ulr_offset points to start of a directory block, + * clear ulr_count so ufs_dirremove() doesn't try to + * merge free space over a directory block boundary. + */ + if ((from_ulr.ulr_offset & (dirblksiz - 1)) == 0) + from_ulr.ulr_count = 0; + + brelse(bp, 0); + } + + /* + * 3) Unlink the source. + */ + +#if 0 + /* + * Ensure that the directory entry still exists and has not + * changed while the new name has been entered. If the source is + * a file then the entry may have been unlinked or renamed. In + * either case there is no further work to be done. If the source + * is a directory then it cannot have been rmdir'ed; The IRENAME + * flag ensures that it cannot be moved by another rename or removed + * by a rmdir. + */ +#endif + KASSERT(fxp == ip); + + /* + * If the source is a directory with a new parent, the link + * count of the old parent directory must be decremented and + * ".." set to point to the new parent. 
+ */ + if (doingdirectory && newparent) { + KASSERT(fdp != NULL); + ufs_dirrewrite(fxp, mastertemplate.dot_reclen, + fdp, newparent, DT_DIR, 0, IN_CHANGE); + cache_purge(fdvp); + } + error = ufs_dirremove(fdvp, &from_ulr, + fxp, fcnp->cn_flags, 0); + fxp->i_flag &= ~IN_RENAME; + + VN_KNOTE(fvp, NOTE_RENAME); + goto done; + + out: + goto out2; + + /* exit routines from steps 1 & 2 */ + bad: + if (doingdirectory) + ip->i_flag &= ~IN_RENAME; + ip->i_nlink--; + DIP_ASSIGN(ip, nlink, ip->i_nlink); + ip->i_flag |= IN_CHANGE; + ip->i_flag &= ~IN_RENAME; + UFS_WAPBL_UPDATE(fvp, NULL, NULL, 0); + done: + UFS_WAPBL_END(fdvp->v_mount); + out2: + /* + * clear IN_RENAME - some exit paths happen too early to go + * through the cleanup done in the "bad" case above, so we + * always do this mini-cleanup here. + */ + ip->i_flag &= ~IN_RENAME; + + VOP_UNLOCK(fdvp); + if (tdvp != fdvp) { + VOP_UNLOCK(tdvp); + } + VOP_UNLOCK(fvp); + if (tvp && tvp != fvp) { + VOP_UNLOCK(tvp); + } + + vrele(fdvp); + vrele(tdvp); + vrele(fvp); + if (tvp) { + vrele(tvp); + } + + fstrans_done(mp); + return (error); + + abort_withlocks: + VOP_UNLOCK(fdvp); + if (tdvp != fdvp) { + VOP_UNLOCK(tdvp); + } + VOP_UNLOCK(fvp); + if (tvp && tvp != fvp) { + VOP_UNLOCK(tvp); + } + + abort: + VOP_ABORTOP(fdvp, fcnp); /* XXX, why not in NFS? */ + VOP_ABORTOP(tdvp, tcnp); /* XXX, why not in NFS? */ + vrele(tdvp); + if (tvp) { + vrele(tvp); + } + vrele(fdvp); + if (fvp) { + vrele(fvp); + } + return (error); +} + +int +ufs_mkdir(void *v) +{ + struct vop_mkdir_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap = v; + struct vnode *dvp = ap->a_dvp, *tvp; + struct vattr *vap = ap->a_vap; + struct componentname *cnp = ap->a_cnp; + struct inode *ip, *dp = VTOI(dvp); + struct buf *bp; + struct dirtemplate dirtemplate; + struct direct *newdir; + int error, dmode; + struct ufsmount *ump = dp->i_ump; + int dirblksiz = ump->um_dirblksiz; + struct ufs_lookup_results *ulr; + + fstrans_start(dvp->v_mount, FSTRANS_SHARED); + + /* XXX should handle this material another way */ + ulr = &dp->i_crap; + UFS_CHECK_CRAPCOUNTER(dp); + + if ((nlink_t)dp->i_nlink >= LINK_MAX) { + error = EMLINK; + goto out; + } + dmode = vap->va_mode & ACCESSPERMS; + dmode |= IFDIR; + /* + * Must simulate part of ufs_makeinode here to acquire the inode, + * but not have it entered in the parent directory. The entry is + * made later after writing "." and ".." entries. + */ + if ((error = UFS_VALLOC(dvp, dmode, cnp->cn_cred, ap->a_vpp)) != 0) + goto out; + + tvp = *ap->a_vpp; + ip = VTOI(tvp); + + error = UFS_WAPBL_BEGIN(ap->a_dvp->v_mount); + if (error) { + UFS_VFREE(tvp, ip->i_number, dmode); + vput(tvp); + goto out; + } + ip->i_uid = kauth_cred_geteuid(cnp->cn_cred); + DIP_ASSIGN(ip, uid, ip->i_uid); + ip->i_gid = dp->i_gid; + DIP_ASSIGN(ip, gid, ip->i_gid); +#if defined(QUOTA) || defined(QUOTA2) + if ((error = chkiq(ip, 1, cnp->cn_cred, 0))) { + UFS_VFREE(tvp, ip->i_number, dmode); + UFS_WAPBL_END(dvp->v_mount); + fstrans_done(dvp->v_mount); + vput(tvp); + vput(dvp); + return (error); + } +#endif + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + ip->i_mode = dmode; + DIP_ASSIGN(ip, mode, dmode); + tvp->v_type = VDIR; /* Rest init'd in getnewvnode(). */ + ip->i_nlink = 2; + DIP_ASSIGN(ip, nlink, 2); + if (cnp->cn_flags & ISWHITEOUT) { + ip->i_flags |= UF_OPAQUE; + DIP_ASSIGN(ip, flags, ip->i_flags); + } + + /* + * Bump link count in parent directory to reflect work done below. 
+ * Should be done before reference is created so cleanup is + * possible if we crash. + */ + dp->i_nlink++; + DIP_ASSIGN(dp, nlink, dp->i_nlink); + dp->i_flag |= IN_CHANGE; + if ((error = UFS_UPDATE(dvp, NULL, NULL, UPDATE_DIROP)) != 0) + goto bad; + + /* + * Initialize directory with "." and ".." from static template. + */ + dirtemplate = mastertemplate; + dirtemplate.dotdot_reclen = dirblksiz - dirtemplate.dot_reclen; + dirtemplate.dot_ino = ufs_rw32(ip->i_number, UFS_MPNEEDSWAP(ump)); + dirtemplate.dotdot_ino = ufs_rw32(dp->i_number, UFS_MPNEEDSWAP(ump)); + dirtemplate.dot_reclen = ufs_rw16(dirtemplate.dot_reclen, + UFS_MPNEEDSWAP(ump)); + dirtemplate.dotdot_reclen = ufs_rw16(dirtemplate.dotdot_reclen, + UFS_MPNEEDSWAP(ump)); + if (ump->um_maxsymlinklen <= 0) { +#if BYTE_ORDER == LITTLE_ENDIAN + if (UFS_MPNEEDSWAP(ump) == 0) +#else + if (UFS_MPNEEDSWAP(ump) != 0) +#endif + { + dirtemplate.dot_type = dirtemplate.dot_namlen; + dirtemplate.dotdot_type = dirtemplate.dotdot_namlen; + dirtemplate.dot_namlen = dirtemplate.dotdot_namlen = 0; + } else + dirtemplate.dot_type = dirtemplate.dotdot_type = 0; + } + if ((error = UFS_BALLOC(tvp, (off_t)0, dirblksiz, cnp->cn_cred, + B_CLRBUF, &bp)) != 0) + goto bad; + ip->i_size = dirblksiz; + DIP_ASSIGN(ip, size, dirblksiz); + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + uvm_vnp_setsize(tvp, ip->i_size); + memcpy((void *)bp->b_data, (void *)&dirtemplate, sizeof dirtemplate); + + /* + * Directory set up, now install it's entry in the parent directory. + * We must write out the buffer containing the new directory body + * before entering the new name in the parent. + */ + if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0) + goto bad; + if ((error = UFS_UPDATE(tvp, NULL, NULL, UPDATE_DIROP)) != 0) { + goto bad; + } + newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK); + ufs_makedirentry(ip, cnp, newdir); + error = ufs_direnter(dvp, ulr, tvp, newdir, cnp, bp); + pool_cache_put(ufs_direct_cache, newdir); + bad: + if (error == 0) { + VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK); + UFS_WAPBL_END(dvp->v_mount); + } else { + dp->i_nlink--; + DIP_ASSIGN(dp, nlink, dp->i_nlink); + dp->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP); + /* + * No need to do an explicit UFS_TRUNCATE here, vrele will + * do this for us because we set the link count to 0. + */ + ip->i_nlink = 0; + DIP_ASSIGN(ip, nlink, 0); + ip->i_flag |= IN_CHANGE; + /* If IN_ADIROP, account for it */ + UFS_UNMARK_VNODE(tvp); + UFS_WAPBL_UPDATE(tvp, NULL, NULL, UPDATE_DIROP); + UFS_WAPBL_END(dvp->v_mount); + vput(tvp); + } + out: + fstrans_done(dvp->v_mount); + vput(dvp); + return (error); +} + +int +ufs_rmdir(void *v) +{ + struct vop_rmdir_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap = v; + struct vnode *vp, *dvp; + struct componentname *cnp; + struct inode *ip, *dp; + int error; + struct ufs_lookup_results *ulr; + + vp = ap->a_vp; + dvp = ap->a_dvp; + cnp = ap->a_cnp; + ip = VTOI(vp); + dp = VTOI(dvp); + + /* XXX should handle this material another way */ + ulr = &dp->i_crap; + UFS_CHECK_CRAPCOUNTER(dp); + + /* + * No rmdir "." or of mounted directories please. + */ + if (dp == ip || vp->v_mountedhere != NULL) { + if (dp == ip) + vrele(dvp); + else + vput(dvp); + vput(vp); + return (EINVAL); + } + + fstrans_start(dvp->v_mount, FSTRANS_SHARED); + + /* + * Do not remove a directory that is in the process of being renamed. + * Verify that the directory is empty (and valid). (Rmdir ".." won't + * be valid since ".." 
will contain a reference to the current + * directory and thus be non-empty.) + */ + error = 0; + if (ip->i_flag & IN_RENAME) { + error = EINVAL; + goto out; + } + if (ip->i_nlink != 2 || + !ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) { + error = ENOTEMPTY; + goto out; + } + if ((dp->i_flags & APPEND) || + (ip->i_flags & (IMMUTABLE | APPEND))) { + error = EPERM; + goto out; + } + error = UFS_WAPBL_BEGIN(dvp->v_mount); + if (error) + goto out; + /* + * Delete reference to directory before purging + * inode. If we crash in between, the directory + * will be reattached to lost+found, + */ + error = ufs_dirremove(dvp, ulr, ip, cnp->cn_flags, 1); + if (error) { + UFS_WAPBL_END(dvp->v_mount); + goto out; + } + VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK); + cache_purge(dvp); + /* + * Truncate inode. The only stuff left in the directory is "." and + * "..". The "." reference is inconsequential since we're quashing + * it. + */ + dp->i_nlink--; + DIP_ASSIGN(dp, nlink, dp->i_nlink); + dp->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP); + ip->i_nlink--; + DIP_ASSIGN(ip, nlink, ip->i_nlink); + ip->i_flag |= IN_CHANGE; + error = UFS_TRUNCATE(vp, (off_t)0, IO_SYNC, cnp->cn_cred); + cache_purge(vp); + /* + * Unlock the log while we still have reference to unlinked + * directory vp so that it will not get locked for recycling + */ + UFS_WAPBL_END(dvp->v_mount); +#ifdef UFS_DIRHASH + if (ip->i_dirhash != NULL) + ufsdirhash_free(ip); +#endif + out: + VN_KNOTE(vp, NOTE_DELETE); + vput(vp); + fstrans_done(dvp->v_mount); + vput(dvp); + return (error); +} + +/* + * symlink -- make a symbolic link + */ +int +ufs_symlink(void *v) +{ + struct vop_symlink_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + char *a_target; + } */ *ap = v; + struct vnode *vp, **vpp; + struct inode *ip; + int len, error; + struct ufs_lookup_results *ulr; + + vpp = ap->a_vpp; + + /* XXX should handle this material another way */ + ulr = &VTOI(ap->a_dvp)->i_crap; + UFS_CHECK_CRAPCOUNTER(VTOI(ap->a_dvp)); + + /* + * UFS_WAPBL_BEGIN1(dvp->v_mount, dvp) performed by successful + * ufs_makeinode + */ + fstrans_start(ap->a_dvp->v_mount, FSTRANS_SHARED); + error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp, ulr, + vpp, ap->a_cnp); + if (error) + goto out; + VN_KNOTE(ap->a_dvp, NOTE_WRITE); + vp = *vpp; + len = strlen(ap->a_target); + ip = VTOI(vp); + if (len < ip->i_ump->um_maxsymlinklen) { + memcpy((char *)SHORTLINK(ip), ap->a_target, len); + ip->i_size = len; + DIP_ASSIGN(ip, size, len); + uvm_vnp_setsize(vp, ip->i_size); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (vp->v_mount->mnt_flag & MNT_RELATIME) + ip->i_flag |= IN_ACCESS; + UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); + } else + error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0, + UIO_SYSSPACE, IO_NODELOCKED | IO_JOURNALLOCKED, + ap->a_cnp->cn_cred, NULL, NULL); + UFS_WAPBL_END1(ap->a_dvp->v_mount, ap->a_dvp); + if (error) + vput(vp); +out: + fstrans_done(ap->a_dvp->v_mount); + return (error); +} + +/* + * Vnode op for reading directories. + * + * This routine handles converting from the on-disk directory format + * "struct direct" to the in-memory format "struct dirent" as well as + * byte swapping the entries if necessary. 
+ */ +int +ufs_readdir(void *v) +{ + struct vop_readdir_args /* { + struct vnode *a_vp; + struct uio *a_uio; + kauth_cred_t a_cred; + int *a_eofflag; + off_t **a_cookies; + int *ncookies; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct direct *cdp, *ecdp; + struct dirent *ndp; + char *cdbuf, *ndbuf, *endp; + struct uio auio, *uio; + struct iovec aiov; + int error; + size_t count, ccount, rcount; + off_t off, *ccp; + off_t startoff; + size_t skipbytes; + struct ufsmount *ump = VFSTOUFS(vp->v_mount); + int nswap = UFS_MPNEEDSWAP(ump); +#if BYTE_ORDER == LITTLE_ENDIAN + int needswap = ump->um_maxsymlinklen <= 0 && nswap == 0; +#else + int needswap = ump->um_maxsymlinklen <= 0 && nswap != 0; +#endif + uio = ap->a_uio; + count = uio->uio_resid; + rcount = count - ((uio->uio_offset + count) & (ump->um_dirblksiz - 1)); + + if (rcount < _DIRENT_MINSIZE(cdp) || count < _DIRENT_MINSIZE(ndp)) + return EINVAL; + + startoff = uio->uio_offset & ~(ump->um_dirblksiz - 1); + skipbytes = uio->uio_offset - startoff; + rcount += skipbytes; + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = startoff; + auio.uio_resid = rcount; + UIO_SETUP_SYSSPACE(&auio); + auio.uio_rw = UIO_READ; + cdbuf = malloc(rcount, M_TEMP, M_WAITOK); + aiov.iov_base = cdbuf; + aiov.iov_len = rcount; + error = VOP_READ(vp, &auio, 0, ap->a_cred); + if (error != 0) { + free(cdbuf, M_TEMP); + return error; + } + + rcount -= auio.uio_resid; + + cdp = (struct direct *)(void *)cdbuf; + ecdp = (struct direct *)(void *)&cdbuf[rcount]; + + ndbuf = malloc(count, M_TEMP, M_WAITOK); + ndp = (struct dirent *)(void *)ndbuf; + endp = &ndbuf[count]; + + off = uio->uio_offset; + if (ap->a_cookies) { + ccount = rcount / _DIRENT_RECLEN(cdp, 1); + ccp = *(ap->a_cookies) = malloc(ccount * sizeof(*ccp), + M_TEMP, M_WAITOK); + } else { + /* XXX: GCC */ + ccount = 0; + ccp = NULL; + } + + while (cdp < ecdp) { + cdp->d_reclen = ufs_rw16(cdp->d_reclen, nswap); + if (skipbytes > 0) { + if (cdp->d_reclen <= skipbytes) { + skipbytes -= cdp->d_reclen; + cdp = _DIRENT_NEXT(cdp); + continue; + } + /* + * invalid cookie. 
+ */ + error = EINVAL; + goto out; + } + if (cdp->d_reclen == 0) { + struct dirent *ondp = ndp; + ndp->d_reclen = _DIRENT_MINSIZE(ndp); + ndp = _DIRENT_NEXT(ndp); + ondp->d_reclen = 0; + cdp = ecdp; + break; + } + if (needswap) { + ndp->d_type = cdp->d_namlen; + ndp->d_namlen = cdp->d_type; + } else { + ndp->d_type = cdp->d_type; + ndp->d_namlen = cdp->d_namlen; + } + ndp->d_reclen = _DIRENT_RECLEN(ndp, ndp->d_namlen); + if ((char *)(void *)ndp + ndp->d_reclen + + _DIRENT_MINSIZE(ndp) > endp) + break; + ndp->d_fileno = ufs_rw32(cdp->d_ino, nswap); + (void)memcpy(ndp->d_name, cdp->d_name, ndp->d_namlen); + memset(&ndp->d_name[ndp->d_namlen], 0, + ndp->d_reclen - _DIRENT_NAMEOFF(ndp) - ndp->d_namlen); + off += cdp->d_reclen; + if (ap->a_cookies) { + KASSERT(ccp - *(ap->a_cookies) < ccount); + *(ccp++) = off; + } + ndp = _DIRENT_NEXT(ndp); + cdp = _DIRENT_NEXT(cdp); + } + + count = ((char *)(void *)ndp - ndbuf); + error = uiomove(ndbuf, count, uio); +out: + if (ap->a_cookies) { + if (error) { + free(*(ap->a_cookies), M_TEMP); + *(ap->a_cookies) = NULL; + *(ap->a_ncookies) = 0; + } else { + *ap->a_ncookies = ccp - *(ap->a_cookies); + } + } + uio->uio_offset = off; + free(ndbuf, M_TEMP); + free(cdbuf, M_TEMP); + *ap->a_eofflag = VTOI(vp)->i_size <= uio->uio_offset; + return error; +} + +/* + * Return target name of a symbolic link + */ +int +ufs_readlink(void *v) +{ + struct vop_readlink_args /* { + struct vnode *a_vp; + struct uio *a_uio; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct ufsmount *ump = VFSTOUFS(vp->v_mount); + int isize; + + isize = ip->i_size; + if (isize < ump->um_maxsymlinklen || + (ump->um_maxsymlinklen == 0 && DIP(ip, blocks) == 0)) { + uiomove((char *)SHORTLINK(ip), isize, ap->a_uio); + return (0); + } + return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred)); +} + +/* + * Calculate the logical to physical mapping if not done already, + * then call the device strategy routine. + */ +int +ufs_strategy(void *v) +{ + struct vop_strategy_args /* { + struct vnode *a_vp; + struct buf *a_bp; + } */ *ap = v; + struct buf *bp; + struct vnode *vp; + struct inode *ip; + struct mount *mp; + int error; + + bp = ap->a_bp; + vp = ap->a_vp; + ip = VTOI(vp); + if (vp->v_type == VBLK || vp->v_type == VCHR) + panic("ufs_strategy: spec"); + KASSERT(bp->b_bcount != 0); + if (bp->b_blkno == bp->b_lblkno) { + error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, + NULL); + if (error) { + bp->b_error = error; + biodone(bp); + return (error); + } + if (bp->b_blkno == -1) /* no valid data */ + clrbuf(bp); + } + if (bp->b_blkno < 0) { /* block is not on disk */ + biodone(bp); + return (0); + } + vp = ip->i_devvp; + + error = VOP_STRATEGY(vp, bp); + if (error) + return error; + + if (!BUF_ISREAD(bp)) + return 0; + + mp = wapbl_vptomp(vp); + if (mp == NULL || mp->mnt_wapbl_replay == NULL || + !WAPBL_REPLAY_ISOPEN(mp) || + !WAPBL_REPLAY_CAN_READ(mp, bp->b_blkno, bp->b_bcount)) + return 0; + + error = biowait(bp); + if (error) + return error; + + error = WAPBL_REPLAY_READ(mp, bp->b_data, bp->b_blkno, bp->b_bcount); + if (error) { + mutex_enter(&bufcache_lock); + SET(bp->b_cflags, BC_INVAL); + mutex_exit(&bufcache_lock); + } + return error; +} + +/* + * Print out the contents of an inode. 
+ */ +int +ufs_print(void *v) +{ + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap = v; + struct vnode *vp; + struct inode *ip; + + vp = ap->a_vp; + ip = VTOI(vp); + printf("tag VT_UFS, ino %llu, on dev %llu, %llu", + (unsigned long long)ip->i_number, + (unsigned long long)major(ip->i_dev), + (unsigned long long)minor(ip->i_dev)); + printf(" flags 0x%x, nlink %d\n", + ip->i_flag, ip->i_nlink); + printf("\tmode 0%o, owner %d, group %d, size %qd", + ip->i_mode, ip->i_uid, ip->i_gid, + (long long)ip->i_size); + if (vp->v_type == VFIFO) + VOCALL(fifo_vnodeop_p, VOFFSET(vop_print), v); + printf("\n"); + return (0); +} + +/* + * Read wrapper for special devices. + */ +int +ufsspec_read(void *v) +{ + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + kauth_cred_t a_cred; + } */ *ap = v; + + /* + * Set access flag. + */ + if ((ap->a_vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0) + VTOI(ap->a_vp)->i_flag |= IN_ACCESS; + return (VOCALL (spec_vnodeop_p, VOFFSET(vop_read), ap)); +} + +/* + * Write wrapper for special devices. + */ +int +ufsspec_write(void *v) +{ + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + kauth_cred_t a_cred; + } */ *ap = v; + + /* + * Set update and change flags. + */ + if ((ap->a_vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0) + VTOI(ap->a_vp)->i_flag |= IN_MODIFY; + return (VOCALL (spec_vnodeop_p, VOFFSET(vop_write), ap)); +} + +/* + * Close wrapper for special devices. + * + * Update the times on the inode then do device close. + */ +int +ufsspec_close(void *v) +{ + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp; + struct inode *ip; + + vp = ap->a_vp; + ip = VTOI(vp); + if (vp->v_usecount > 1) + UFS_ITIMES(vp, NULL, NULL, NULL); + return (VOCALL (spec_vnodeop_p, VOFFSET(vop_close), ap)); +} + +/* + * Read wrapper for fifo's + */ +int +ufsfifo_read(void *v) +{ + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + kauth_cred_t a_cred; + } */ *ap = v; + + /* + * Set access flag. + */ + VTOI(ap->a_vp)->i_flag |= IN_ACCESS; + return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_read), ap)); +} + +/* + * Write wrapper for fifo's. + */ +int +ufsfifo_write(void *v) +{ + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + kauth_cred_t a_cred; + } */ *ap = v; + + /* + * Set update and change flags. + */ + VTOI(ap->a_vp)->i_flag |= IN_MODIFY; + return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_write), ap)); +} + +/* + * Close wrapper for fifo's. + * + * Update the times on the inode then do device close. + */ +int +ufsfifo_close(void *v) +{ + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + kauth_cred_t a_cred; + } */ *ap = v; + struct vnode *vp; + struct inode *ip; + + vp = ap->a_vp; + ip = VTOI(vp); + if (ap->a_vp->v_usecount > 1) + UFS_ITIMES(vp, NULL, NULL, NULL); + return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_close), ap)); +} + +/* + * Return POSIX pathconf information applicable to ufs filesystems. 
+ */ +int +ufs_pathconf(void *v) +{ + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + register_t *a_retval; + } */ *ap = v; + + switch (ap->a_name) { + case _PC_LINK_MAX: + *ap->a_retval = LINK_MAX; + return (0); + case _PC_NAME_MAX: + *ap->a_retval = FFS_MAXNAMLEN; + return (0); + case _PC_PATH_MAX: + *ap->a_retval = PATH_MAX; + return (0); + case _PC_PIPE_BUF: + *ap->a_retval = PIPE_BUF; + return (0); + case _PC_CHOWN_RESTRICTED: + *ap->a_retval = 1; + return (0); + case _PC_NO_TRUNC: + *ap->a_retval = 1; + return (0); + case _PC_SYNC_IO: + *ap->a_retval = 1; + return (0); + case _PC_FILESIZEBITS: + *ap->a_retval = 42; + return (0); + case _PC_SYMLINK_MAX: + *ap->a_retval = MAXPATHLEN; + return (0); + case _PC_2_SYMLINKS: + *ap->a_retval = 1; + return (0); + default: + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Advisory record locking support + */ +int +ufs_advlock(void *v) +{ + struct vop_advlock_args /* { + struct vnode *a_vp; + void * a_id; + int a_op; + struct flock *a_fl; + int a_flags; + } */ *ap = v; + struct inode *ip; + + ip = VTOI(ap->a_vp); + return lf_advlock(ap, &ip->i_lockf, ip->i_size); +} + +/* + * Initialize the vnode associated with a new inode, handle aliased + * vnodes. + */ +void +ufs_vinit(struct mount *mntp, int (**specops)(void *), int (**fifoops)(void *), + struct vnode **vpp) +{ + struct timeval tv; + struct inode *ip; + struct vnode *vp; + dev_t rdev; + struct ufsmount *ump; + + vp = *vpp; + ip = VTOI(vp); + switch(vp->v_type = IFTOVT(ip->i_mode)) { + case VCHR: + case VBLK: + vp->v_op = specops; + ump = ip->i_ump; + if (ump->um_fstype == UFS1) + rdev = (dev_t)ufs_rw32(ip->i_ffs1_rdev, + UFS_MPNEEDSWAP(ump)); + else + rdev = (dev_t)ufs_rw64(ip->i_ffs2_rdev, + UFS_MPNEEDSWAP(ump)); + spec_node_init(vp, rdev); + break; + case VFIFO: + vp->v_op = fifoops; + break; + case VNON: + case VBAD: + case VSOCK: + case VLNK: + case VDIR: + case VREG: + break; + } + if (ip->i_number == ROOTINO) + vp->v_vflag |= VV_ROOT; + /* + * Initialize modrev times + */ + getmicrouptime(&tv); + ip->i_modrev = (uint64_t)(uint)tv.tv_sec << 32 + | tv.tv_usec * 4294u; + *vpp = vp; +} + +/* + * Allocate a new inode. + */ +int +ufs_makeinode(int mode, struct vnode *dvp, const struct ufs_lookup_results *ulr, + struct vnode **vpp, struct componentname *cnp) +{ + struct inode *ip, *pdir; + struct direct *newdir; + struct vnode *tvp; + int error, ismember = 0; + + UFS_WAPBL_JUNLOCK_ASSERT(dvp->v_mount); + + pdir = VTOI(dvp); + + if ((mode & IFMT) == 0) + mode |= IFREG; + + if ((error = UFS_VALLOC(dvp, mode, cnp->cn_cred, vpp)) != 0) { + vput(dvp); + return (error); + } + tvp = *vpp; + ip = VTOI(tvp); + ip->i_gid = pdir->i_gid; + DIP_ASSIGN(ip, gid, ip->i_gid); + ip->i_uid = kauth_cred_geteuid(cnp->cn_cred); + DIP_ASSIGN(ip, uid, ip->i_uid); + error = UFS_WAPBL_BEGIN1(dvp->v_mount, dvp); + if (error) { + /* + * Note, we can't VOP_VFREE(tvp) here like we should + * because we can't write to the disk. Instead, we leave + * the vnode dangling from the journal. + */ + vput(tvp); + vput(dvp); + return (error); + } +#if defined(QUOTA) || defined(QUOTA2) + if ((error = chkiq(ip, 1, cnp->cn_cred, 0))) { + UFS_VFREE(tvp, ip->i_number, mode); + UFS_WAPBL_END1(dvp->v_mount, dvp); + vput(tvp); + vput(dvp); + return (error); + } +#endif + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + ip->i_mode = mode; + DIP_ASSIGN(ip, mode, mode); + tvp->v_type = IFTOVT(mode); /* Rest init'd in getnewvnode(). 
*/ + ip->i_nlink = 1; + DIP_ASSIGN(ip, nlink, 1); + if ((ip->i_mode & ISGID) && (kauth_cred_ismember_gid(cnp->cn_cred, + ip->i_gid, &ismember) != 0 || !ismember) && + kauth_authorize_generic(cnp->cn_cred, KAUTH_GENERIC_ISSUSER, NULL)) { + ip->i_mode &= ~ISGID; + DIP_ASSIGN(ip, mode, ip->i_mode); + } + + if (cnp->cn_flags & ISWHITEOUT) { + ip->i_flags |= UF_OPAQUE; + DIP_ASSIGN(ip, flags, ip->i_flags); + } + + /* + * Make sure inode goes to disk before directory entry. + */ + if ((error = UFS_UPDATE(tvp, NULL, NULL, UPDATE_DIROP)) != 0) + goto bad; + newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK); + ufs_makedirentry(ip, cnp, newdir); + error = ufs_direnter(dvp, ulr, tvp, newdir, cnp, NULL); + pool_cache_put(ufs_direct_cache, newdir); + if (error) + goto bad; + vput(dvp); + *vpp = tvp; + return (0); + + bad: + /* + * Write error occurred trying to update the inode + * or the directory so must deallocate the inode. + */ + ip->i_nlink = 0; + DIP_ASSIGN(ip, nlink, 0); + ip->i_flag |= IN_CHANGE; + /* If IN_ADIROP, account for it */ + UFS_UNMARK_VNODE(tvp); + UFS_WAPBL_UPDATE(tvp, NULL, NULL, 0); + tvp->v_type = VNON; /* explodes later if VBLK */ + UFS_WAPBL_END1(dvp->v_mount, dvp); + vput(tvp); + vput(dvp); + return (error); +} + +/* + * Allocate len bytes at offset off. + */ +int +ufs_gop_alloc(struct vnode *vp, off_t off, off_t len, int flags, + kauth_cred_t cred) +{ + struct inode *ip = VTOI(vp); + int error, delta, bshift, bsize; + UVMHIST_FUNC("ufs_gop_alloc"); UVMHIST_CALLED(ubchist); + + error = 0; + bshift = vp->v_mount->mnt_fs_bshift; + bsize = 1 << bshift; + + delta = off & (bsize - 1); + off -= delta; + len += delta; + + while (len > 0) { + bsize = MIN(bsize, len); + + error = UFS_BALLOC(vp, off, bsize, cred, flags, NULL); + if (error) { + goto out; + } + + /* + * increase file size now, UFS_BALLOC() requires that + * EOF be up-to-date before each call. + */ + + if (ip->i_size < off + bsize) { + UVMHIST_LOG(ubchist, "vp %p old 0x%x new 0x%x", + vp, ip->i_size, off + bsize, 0); + ip->i_size = off + bsize; + DIP_ASSIGN(ip, size, ip->i_size); + } + + off += bsize; + len -= bsize; + } + +out: + UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); + return error; +} + +void +ufs_gop_markupdate(struct vnode *vp, int flags) +{ + u_int32_t mask = 0; + + if ((flags & GOP_UPDATE_ACCESSED) != 0) { + mask = IN_ACCESS; + } + if ((flags & GOP_UPDATE_MODIFIED) != 0) { + if (vp->v_type == VREG) { + mask |= IN_CHANGE | IN_UPDATE; + } else { + mask |= IN_MODIFY; + } + } + if (mask) { + struct inode *ip = VTOI(vp); + + ip->i_flag |= mask; + } +} diff --git a/sys/ufs/ufs/ufs_wapbl.c b/sys/ufs/ufs/ufs_wapbl.c new file mode 100644 index 000000000..1f11526e7 --- /dev/null +++ b/sys/ufs/ufs/ufs_wapbl.c @@ -0,0 +1,166 @@ +/* $NetBSD: ufs_wapbl.c,v 1.22 2011/07/18 06:46:05 dholland Exp $ */ + +/*- + * Copyright (c) 2003,2006,2008 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 1982, 1986, 1989, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ufs_vnops.c 8.28 (Berkeley) 7/31/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ufs_wapbl.c,v 1.22 2011/07/18 06:46:05 dholland Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef WAPBL_DEBUG_INODES +#error WAPBL_DEBUG_INODES: not functional before ufs_wapbl.c is updated +void +ufs_wapbl_verify_inodes(struct mount *mp, const char *str) +{ + struct vnode *vp, *nvp; + struct inode *ip; + struct buf *bp, *nbp; + + mutex_enter(&mntvnode_lock); + loop: + TAILQ_FOREACH_REVERSE(vp, &mp->mnt_vnodelist, vnodelst, v_mntvnodes) { + /* + * If the vnode that we are about to sync is no longer + * associated with this mount point, start over. + */ + if (vp->v_mount != mp) + goto loop; + mutex_enter(&vp->v_interlock); + nvp = TAILQ_NEXT(vp, v_mntvnodes); + ip = VTOI(vp); + if (vp->v_type == VNON) { + mutex_exit(&vp->v_interlock); + continue; + } + /* verify that update has been called on all inodes */ + if (ip->i_flag & (IN_CHANGE | IN_UPDATE)) { + panic("wapbl_verify: mp %p: dirty vnode %p (inode %p): 0x%x\n", + mp, vp, ip, ip->i_flag); + } + mutex_exit(&mntvnode_lock); + + mutex_enter(&bufcache_lock); + for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = LIST_NEXT(bp, b_vnbufs); + if ((bp->b_cflags & BC_BUSY)) { + continue; + } + KASSERT((bp->b_oflags & BO_DELWRI) != 0); + KASSERT((bp->b_flags & B_LOCKED) != 0); + } + mutex_exit(&bufcache_lock); + mutex_exit(&vp->v_interlock); + + mutex_enter(&mntvnode_lock); + } + mutex_exit(&mntvnode_lock); + + vp = VFSTOUFS(mp)->um_devvp; + mutex_enter(&vp->v_interlock); + mutex_enter(&bufcache_lock); + for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = LIST_NEXT(bp, b_vnbufs); + if ((bp->b_cflags & BC_BUSY)) { + continue; + } + KASSERT((bp->b_oflags & BO_DELWRI) != 0); + KASSERT((bp->b_flags & B_LOCKED) != 0); + } + mutex_exit(&bufcache_lock); + mutex_exit(&vp->v_interlock); +} +#endif /* WAPBL_DEBUG_INODES */ diff --git a/include/ufs/ufs/ufs_wapbl.h b/sys/ufs/ufs/ufs_wapbl.h similarity index 100% rename from include/ufs/ufs/ufs_wapbl.h rename to sys/ufs/ufs/ufs_wapbl.h diff --git a/include/ufs/ufs/ufsmount.h b/sys/ufs/ufs/ufsmount.h similarity index 100% rename from include/ufs/ufs/ufsmount.h rename to sys/ufs/ufs/ufsmount.h diff --git a/tools/nbsd_ports b/tools/nbsd_ports index ea9ee9469..bf059ec0b 100644 --- a/tools/nbsd_ports +++ b/tools/nbsd_ports @@ -2,16 +2,16 @@ # Timestamp in UTC,minixpath,netbsdpath # minixpath: path in Minix source tree (starting from /usr/src/) # netbsdpath: path in BSD source tree (starting from src/) +2011/12/25 06:09:09,sys/arch/i386/stand 2012/02/10 16:16:12,share/zoneinfo 2011/05/26 00:00:00,external/public-domain/xz 2011/09/30 01:32:21,usr.bin/gzip 2011/08/27 12:55:09,bin/date 2011/10/17 09:24:54,common/lib/libprop -2011/11/28 12:50:07,include/ufs,sys/ufs +2011/11/28 12:50:07,sys/ufs 2010/09/10 15:51:20,sbin/newfs_ext2fs 2011/09/16 16:13:18,sbin/fsck_ext2fs 2011/09/30 22:08:19,lib/libprop -2011/08/30 12:39:55,common/include/arch/i386,sys/arch/i386/include 2011/11/13 22:19:09,common/include 2011/01/17 18:11:10,common/lib/libc 2011/01/21 23:36:49,lib/libc @@ -40,7 +40,7 @@ 2011/09/01 13:37:33,usr.bin/du 2010/07/07 21:24:34,usr.bin/man 2009/05/08 12:48:43,usr.bin/apropos -2011/01/12 23:02:22,usr.bin/mdocml,external/bsd/mdocml 
+2011/01/12 23:02:22,external/bsd/mdocml 2011/11/03 20:46:41,usr.sbin/installboot 2011/01/04 10:01:51,usr.sbin/pwd_mkdb 2011/01/04 10:30:21,usr.sbin/user @@ -50,10 +50,5 @@ 2007/05/28 12:06:25,usr.bin/bzip2recover 2009/04/02 21:39:33,libexec/makewhatis 2010/05/14 16:43:34,dist/bzip2 -2011/08/17 00:07:38,sys/arch/i386/stand/bootxx -2011/12/25 06:09:09,sys/arch/i386/stand/boot -2011/05/20 22:29:55,sys/arch/i386/stand/cdboot -2011/09/21 18:15:59,sys/arch/i386/stand/mbr -2011/11/28 07:56:54,sys/arch/i386/stand/lib 2012/01/16 18:47:57,sys/lib/libsa 2011/10/30 00:28:57,sys/lib/libz diff --git a/usr.bin/Makefile b/usr.bin/Makefile index 9674cd9ac..6528d9318 100644 --- a/usr.bin/Makefile +++ b/usr.bin/Makefile @@ -3,7 +3,7 @@ .include # NetBSD imports -SUBDIR= indent m4 stat tic sed mkdep uniq seq du man mdocml \ +SUBDIR= indent m4 stat tic sed mkdep uniq seq du man \ apropos chpass newgrp passwd bzip2 bzip2recover gzip # Non-NetBSD imports
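One further aside, on ufs_gop_alloc() in the ufs_vnops.c hunk above: the delta computation rounds the starting offset down to a file system block boundary and widens the length to compensate, so each UFS_BALLOC() call works on an aligned, at-most-block-sized chunk. A standalone sketch of just that arithmetic, assuming 8 KB blocks (bshift = 13) and an arbitrary request:

/* Illustrative only: the off/len rounding loop of ufs_gop_alloc(). */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	const int bshift = 13;			/* assume 8 KB blocks */
	const int64_t bsize = (int64_t)1 << bshift;

	int64_t off = 20000, len = 1000;	/* arbitrary request */

	int64_t delta = off & (bsize - 1);	/* distance into the block */
	off -= delta;				/* round the start down ...  */
	len += delta;				/* ... and widen the request */

	while (len > 0) {
		int64_t chunk = bsize < len ? bsize : len;
		/* ufs_gop_alloc() would call UFS_BALLOC(vp, off, chunk, ...) here */
		printf("allocate %lld bytes at offset %lld\n",
		    (long long)chunk, (long long)off);
		off += chunk;
		len -= chunk;
	}
	return 0;
}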