#!/usr/pkg/bin/perl
# -*- mode: perl; -*-

# Copyright (c) 2003--2006 BBN Technologies Corp.  All rights reserved.
# Copyright (c) 2014 Raytheon BBN Technologies.  All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# 3. Neither the name of BBN Technologies nor the names of its contributors
#    may be used to endorse or promote products derived from this software
#    without specific prior written permission.

# THIS SOFTWARE IS PROVIDED BY BBN TECHNOLOGIES AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL BBN TECHNOLOGIES OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.

# \todo move the bulk of this to a man page
my $usage = "Usage: $0 [option|command...]

Options:

    Information:

    -h, --help
        Show this usage message on the standard output and exit
        successfully (ignore all other options and commands).

    -V, --version
        Show version information on the standard output and exit
        successfully (ignore all other options and commands).

    Command-specific options:

    --md5 <hash>
        When adding a file to the database via the --add command,
        store <hash> instead of the calculated hash value of the
        specified file.  This may only be specified after an --add
        command, and may not be specified more than once after each
        --add command.  If <hash> is the string 'manual' (without the
        quotes), the file will be marked as manually managed (see the
        --manual command).

    Options that change the behavior of subsequent commands:

    --db <file>
        Default: $dbfile
        Use <file> as the pathname to the database of managed files.

    --destdir <dir>
        Default: the empty string.
        For every operation involving a file or directory, prefix the
        pathname with <dir> before performing the operation.  For
        example, assuming the --db option is not specified, the
        pathname of the database to use becomes:
            <dir>$dbfile
        instead of:
            $dbfile

    --no-safety-checks
        When a safety check fails, warn rather than exit the script
        with an error.

Commands:

    --add <file>
        Add an entry for the file with pathname <file> to the
        database, or update an existing entry.  If the --md5 option is
        not used, calculate the hash of the named file and store that
        in the database entry.  If the --md5 option is used, store the
        given hash value instead.

    --check
        For each file recorded in the database, test to see if the
        file exists and is a regular file in the live file system.  If
        not, print a message to the standard output and remove the
        entry from the database.

    --clear
        Remove all entries from the database.

    --generate-manifest <dir>
        Print a manifest for the files in <dir> to the standard
        output.  Non-regular files are excluded from the manifest.  A
        manifest is a series of text lines:
          * lines that begin with a '#' character are comment lines
            and are ignored by the --import-manifest command
          * each other line is a pathname followed by a space followed
            by the hash of the named file

    --import-manifest
        Read a manifest provided to the standard input.  For each
        non-comment line, if the named file exists in the filesystem,
        does not already have an entry in the database, and the hash
        of the named file matches the hash in the manifest line, the
        file is added to the database.

    --list <file>
        If the database contains an entry for <file>, print a line of
        the format:
            <file> <recorded_hash>
        Otherwise, print a line of the format:
            NOT <file>

    --manual <file>
        Equivalent to '--add <file> --md5 manual'.  This adds an entry
        for <file> to the database with a special hash value that
        indicates that the file is manually managed.  Manually managed
        files are never modified by etcmanage.

    --print
        Dump the contents of the database to standard output in
        manifest format.

    --remove <file>
        If present, remove the entry for <file> from the database.

    --unmanaged
        For each regular file in /etc, /dev/MAKEDEV, and
        /dev/MAKEDEV.local, print the file's pathname if the file is
        not in the database or is marked as manually managed.

    --update <dir>
        Update the configuration files with the latest upstream
        versions in <dir>:

          * For each pathname <file> in the database, if the hash of
            the file named <file> matches the hash stored in the
            database:
              - If the file named <dir><file> exists, copy it to
                <file> and update the hash stored in the database.
              - Otherwise, delete <file> and remove the entry from the
                database.

          * For each file in <dir>, let <file> be the pathname of the
            file with the leading <dir> removed.  If an entry for
            <file> does not exist in the database and a file named
            <file> does not exist, copy <dir><file> to <file> and add
            <file> to the database.

        For each file processed, a line of the form '<message> <file>'
        is logged to the standard output.  The following is a list of
        possible values for <message>, along with the condition that
        causes <message> to be printed and the action that is taken:

          * NEW
              - condition:
                  * <file> is not in the database
                  * <file> does not exist
                  * <dir><file> exists
              - action: copy <dir><file> to <file> and add an entry to
                the database for <file>

          * MANUAL_DELETED;UPSTREAM_WITHDRAWN
              - condition:
                  * <file> is marked as manually managed
                  * <file> does not exist
                  * <dir><file> does not exist
              - action: none

          * MANUAL_DELETED
              - condition:
                  * <file> is marked as manually managed
                  * <file> does not exist
                  * <dir><file> exists
              - action: none

          * MISSING;UPSTREAM_WITHDRAWN
              - condition:
                  * <file> is marked as automatically managed
                  * <file> does not exist
                  * <dir><file> does not exist
              - action: none

          * MISSING
              - condition:
                  * <file> is marked as automatically managed
                  * <file> does not exist
                  * <dir><file> exists with a hash that matches the
                    database entry
              - action: none

          * MISSING;UPSTREAM_DIFFERENT
              - condition:
                  * <file> is marked as automatically managed
                  * <file> does not exist
                  * <dir><file> exists with a hash that differs from
                    the database entry
              - action: none

          * UNMANAGED_EQ_UPSTREAM
              - condition:
                  * <file> is not in the database
                  * <file> exists
                  * <dir><file> exists with the same hash as <file>
              - action: none

          * UNMANAGED_NEQ_UPSTREAM
              - condition:
                  * <file> is not in the database
                  * <file> exists
                  * <dir><file> exists with a hash that differs from
                    the hash of <file>
              - action: none

          * MANUAL;UPSTREAM_WITHDRAWN
              - condition:
                  * <file> is marked as manually managed
                  * <file> exists
                  * <dir><file> does not exist
              - action: none


          * MANUAL_EQ_UPSTREAM
              - condition:
                  * <file> is marked as manually managed
                  * <file> exists
                  * <dir><file> exists with the same hash as <file>
              - action: none

          * MANUAL
              - condition:
                  * <file> is marked as manually managed
                  * <file> exists
                  * <dir><file> exists with a hash that differs from
                    the hash of <file>
              - action: none

          * DELETED
              - condition:
                  * <file> is marked as automatically managed
                  * <file> exists with a hash that matches the
                    database
                  * <dir><file> does not exist
              - action: delete <file> and remove the entry for <file>
                from the database

          * UPDATED
              - condition:
                  * <file> is marked as automatically managed
                  * <file> exists with a hash that matches the
                    database
                  * <dir><file> exists with a different hash
              - action: copy <dir><file> to <file> and update the hash
                stored in the database

          * MODIFIED;UPSTREAM_WITHDRAWN
              - condition:
                  * <file> is marked as automatically managed
                  * <file> exists with a hash that differs from the
                    hash in the database
                  * <dir><file> does not exist
              - action: none

          * MODIFIED_EQ_UPSTREAM
              - condition:
                  * <file> is marked as automatically managed
                  * <file> exists with a hash that differs from the
                    hash in the database
                  * <dir><file> exists with the same hash as <file>
              - action: none

          * MODIFIED
              - condition:
                  * <file> is marked as automatically managed
                  * <file> exists with a hash that differs from the
                    hash in the database
                  * <dir><file> exists with a hash that matches the
                    hash in the database
              - action: none

          * MODIFIED;UPSTREAM_DIFFERENT
              - condition:
                  * <file> is marked as automatically managed
                  * <file> exists with a hash that differs from the
                    hash in the database
                  * <dir><file> exists with a hash that differs from
                    both the hash of <file> and the hash stored in the
                    database
              - action: none
";

use strict;
use warnings;
use AnyDBM_File;
use DB_File;
use Digest::MD5;
use Getopt::Long;
use Fcntl qw(:mode);
use File::Basename qw(dirname);
use File::Copy;
use File::Find;
use File::Path qw(make_path);
use File::Spec::Functions qw(:ALL);
use File::stat;

my $dbfile = "/var/db/etcmanage.db";
my %db;
my $DESTDIR = "";
my $safety_checks_enabled = 1;
# where to write messages that are more urgent than ordinary log
# messages but don't rise to the level of error messages.  this is
# hard-coded to STDOUT for now to facilitate automated processing of
# '--update' output.  \todo consider adding a command-line option that
# changes this to STDERR
my $notice_fh = \*STDOUT;

######################################################################
### logging and error handling utility functions

sub errstr { my $pfx = shift(); return join("", map("$pfx: $_\n", @_)); }
sub warning { warn(errstr("WARNING", @_)); }
sub error { warn(errstr("ERROR", @_)); }
sub fatal { die(errstr("ERROR", @_)); }
sub safety_error { ($safety_checks_enabled ? \&error : \&warning)->(@_); }
sub safety_fatal { ($safety_checks_enabled ? \&fatal : \&warning)->(@_); }

# substitute whitespace for the equivalent perl escape sequence (for
# safer logging of abnormal strings)
sub escape {
    my %repl = ("\f" => "\\f", "\n" => "\\n", "\r" => "\\r", "\t" => "\\t");
    my $re = qr/[${[join("", keys(%repl))]}[0]]/;
    return $_[0] =~ s/($re)/$repl{$1}/gr
}

sub check_md5 {
    my $md5 = shift();
    # if the md5 contains whitespace or is the empty string, a
    # printed manifest will not be importable (whitespace is a
    # field separator, and exactly two fields must be present)
    ($md5 !~ /\s/) or safety_fatal("checksum contains whitespace: "
				   . escape($md5));
    ($md5 ne "") or safety_fatal("checksum is the empty string");
}

######################################################################
### pathname handling utility functions

# returns true iff given pathname is an absolute path
sub path_abs_p { return file_name_is_absolute($_[0]); }

# returns true iff given pathname does not have '..' as a path component
sub path_noupdir_p { return (grep($_ eq updir(), splitdir($_[0])) == 0); }

# perform a bunch of pathname sanity checks on $_[0] and call a
# function for each check that fails.  Arguments:
#   0. the pathname to check
#   1. coderef to a function to call (with an error message as the
#      only argument) if the pathname is relative, contains '/../', or
#      has extra slashes and/or '.' components.
#      Default: \&safety_fatal.
#   2. coderef to a function to call (with an error message as the
#      only argument) if the pathname contains whitespace, begins with
#      '#', or is the empty string.  (These conditions would result in
#      a bad manifest if printed.)
#      Default: the coderef used for argument 1
sub check_path {
    my $f = shift();
    my $abnormalfn = shift() || \&safety_fatal;
    my $badmanifest = shift() || $abnormalfn;

    ($f !~ /\s/)
	or $badmanifest->("pathname contains whitespace: " . escape($f));
    ($f !~ /^#/)
	or $badmanifest->("pathname begins with '#': $f");
    ($f ne "")
	or $badmanifest->("pathname is the empty string");

    path_abs_p($f)
	or $abnormalfn->("pathname is relative: $f");
    path_noupdir_p($f)
	or $abnormalfn->("pathname contains '" . updir() . "': $f");
    (canonpath($f) eq $f)
	or $abnormalfn->(
	    "pathname contains superfluous slashes or '.' components: $f");
}

# Prepend the last argument with the path concatenation of the
# previous arguments.  This is like File::Spec->catfile() but that
# function is broken:  If given a directory and an absolute pathname,
# catfile() will return a string with multiple slashes.  For example:
#
#     File::Spec->catfile("/foo", "/bar", "/baz") -> "/foo/bar//baz"
#
# It should either ignore all previous arguments if an argument is
# absolute, or it should elide the extra slashes.  This function
# elides the slashes.
#
sub prefixpath {
    return canonpath(catfile(@_));
}

# returns $_[1] as an absolute path relative to $_[0] (i.e., the
# pathname of the file identified by $_[1] as if it was examined
# inside a chroot at $_[0])
sub strippath {
    my $base = shift() || rootdir();
    my $p = shift();
    return canonpath(catfile(rootdir(), abs2rel($p, $base)));
}

# returns $_[0] appended to $DESTDIR after performing some sanity
# checks
sub destpath {
    my $f = shift();
    path_abs_p($f) or safety_fatal("pathname is relative: $f");
    path_noupdir_p($f)
	or safety_fatal("pathname contains '" . updir() . "': $f");
    return prefixpath($DESTDIR, $f);
}

# reverses destpath()
sub undestpath {
    return strippath($DESTDIR, @_);
}

######################################################################

# cp_rfp(src, dst, [base])
#
# copy a file to a destination, preserving file mode bits (except
# S_ISVTX), uid, gid, mtime, and atime.  if base is specified, src is
# a path relative to base and the destination is the concatenation of
# dst with src and any missing directory in the destination pathname
# are created preserving the source directory's properties.
#
# this function is similar to 'cp -rfp' except only one file is
# copied, not the whole file hierarchy.
#
# this function creates the directories with default ownership and
# conservative permissions, then copies the file, then syncs the
# ownership and permissions of the source file and relevant parent
# directories (if any) in a depth-first manner.  it is done this way
# for several reasons:
#   * it matches the behavior of 'cp -rp'
#   * it ensures that atimes and mtimes are accurate
#   * it reduces (perhaps eliminates?) an attacker's ability to
#     exploit the time between the various file/directory operations
sub cp_rfp {
    my ($src, $dst, $base) = @_;
    my $dstbase;
    if (defined($base)) {
	$dstbase = $dst;
	$base = canonpath($base);
	(-d $dst) or fatal("destination directory $dst does not exist");
	(-d $base) or fatal("base directory $base does not exist");
	$dst = prefixpath($dst, $src);
	$src = prefixpath($base, $src);
    }
    unlink($dst) or fatal("unable to delete $dst: $!")
	if ((-e $dst) || (-l $dst));
    open(my $src_fh, "<", $src)
	or fatal("unable to open source file $src: $!");
    binmode($src_fh) or fatal("failed to open source in binary mode: $!");
    my $src_st = stat($src_fh);
    my @created = ();
    if (defined($base)) {
	# construct parent directories as needed, preserving permissions
	my $srcdir = $base;
	my $dstdir = $dstbase;
	foreach (splitdir(abs2rel(dirname($src), $base))) {
	    $srcdir = catdir($srcdir, $_);
	    $dstdir = catdir($dstdir, $_);
	    (-d $srcdir) or fatal("assertion error; dir $srcdir missing");
	    !(-l $srcdir) or fatal("symbolic links in source dir ($srcdir) not supported");
	    (-d $dstdir) and next;
	    my $st = stat($srcdir);

	    # POSIX spec for 'cp -rp' says that the mode of the new
	    # directory should be unmodified by umask, but that seems
	    # less safe (what if running as root and the source dir is
	    # owned by a sketchy user and has mode a=rwx?) and it
	    # isn't what NetBSD's cp does
	    #
	    # note: POSIX doesn't provide a way to atomically create a
	    # directory AND get an open file descriptor, so it is
	    # opened with O_DIRECTORY and O_NOFOLLOW right after
	    # creation to ensure that someone didn't replace the
	    # directory with a script or symlink between operations.
	    # the file descriptor will be used to chown, chmod, and
	    # utime the directory after the child directories and
	    # config file have been created.
	    #
	    # todo: this should really use the POSIX Issue 7 mkdirat()
	    # interface to reduce (but not eliminate?) the chance of
	    # vulnerability via races.  unfortunately, an equivalent
	    # Perl function isn't available and that C function isn't
	    # even available in NetBSD 6.1
	    mkdir($dstdir, ($st->mode & 0777) | S_IRWXU)
		or fatal("failed to create directory $dstdir: $!");
	    # can an attacker take advantage of time between mkdir
	    # above and sysopen below?
	    sysopen(my $dfh, $dstdir, O_RDONLY | O_DIRECTORY | O_NOFOLLOW)
		or fatal("failed to open directory $dstdir: $!");
	    # save the fd and stat info for later permissions changing
	    push(@created, [$dstdir, $dfh, $st]);
	}
    }
    sysopen(my $dst_fh, $dst, O_WRONLY|O_CREAT|O_EXCL, $src_st->mode & 0777)
	or fatal("failed to open destination file $dst: $!");
    binmode($dst_fh) or fatal("failed to open destination in binary mode: $!");
    copy($src_fh, $dst_fh) or fatal("failed to copy $src to $dst: $!");
    push(@created, [$dst, $dst_fh, $src_st]);
    while (my $info = pop(@created)) {
	my ($name, $fh, $st) = @{$info};
	# chown before chmod because chown might clear set{u,g}id bits
	chown($st->uid, $st->gid, $fh)
	    or fatal("failed to change ownership of $name: $!");
	chmod($st->mode & (0777 | S_ISUID | S_ISGID), $fh)
	    or fatal("failed to change permissions of $name: $!");
	# set utimes last because other operations might update atime
	utime($st->atime, $st->mtime, $fh)
	    or fatal("failed to change utimes of $name: $!");
	close($fh) or warning("failed to close $name: $!");
    }
}

# given a pathname, return the md5 sum (no space, no newline)
sub md5file {
    my ($f) = @_;
    my $df = destpath($f);
    (-f $df) or fatal("can't get checksum of $f: not a regular file");
    open(my $fh, "<", $df) or fatal("unable to open $f for reading: $!");
    binmode($fh);
    return Digest::MD5->new->addfile($fh)->hexdigest;
}

# check the db
#
# XXX why does this delete entries from the db?  the word 'check'
# implies a non-mutating action...
sub db_check {
    open_db();
    foreach my $fetc (keys(%db)) {
	check_path($fetc, \&warning);
	# should not happen <- XXX why shouldn't it happen?
	if (! -f destpath($fetc)) {
	    # XXX why is there a '#' character here?
	    print $notice_fh "#MISSING $fetc\n";
	    delete $db{$fetc};
	}
    }
}

# print the database in manifest format
sub db_print {
    open_db();
    keys(%db); # reset the iterator for safety
    while (my ($fetc, $md5) = each %db) {
	check_path($fetc, \&warning, \&safety_fatal);
	check_md5($md5);
	print "$fetc $md5\n";
    }
}

# clear the db
sub db_clear {
    open_db();
    %db = ();
}

# list the given file
sub db_list {
    my ($f) = @_;
    check_path($f, \&warning);
    open_db();
    if ($db{$f}) {
	print "$f $db{$f}\n";
    } else {
	print "NOT $f\n";
    }
}

sub db_manual {
    db_add_md5($_[0], "manual");
}

# print non-managed files
sub db_unmanaged {
    open_db();
    find(
	{
	    wanted => sub {
		# skip anything that is not a regular file
		-f || return;

		my $f = undestpath($File::Find::name);
		if (! $db{$f} or $db{$f} eq "manual") {
		    print "$f\n";
		}
	    },
	    no_chdir => 1,
	},
	map(destpath($_),
	    "/etc",
	    "/dev/MAKEDEV",
	    "/dev/MAKEDEV.local",
	));
}

# add/replace the given file with its current md5sum
sub db_add {
    my ($f) = @_;
    my $md5 = md5file($f);
    db_add_md5($f, $md5);
}

# add the given file with a specific md5sum
sub db_add_md5 {
    my ($f, $md5) = @_;
    check_md5($md5);
    open_db();
    check_path($f, exists($db{$f}) ? \&warning : undef);
    $db{$f} = $md5;
}

# remove the given file
sub db_remove {
    my ($f) = @_;
    check_path($f, \&warning);
    open_db();
    delete $db{$f};
}

# print out a manifest for a tree at a given upstream dir
sub generate_manifest {
    # generate does not require database access
    my ($upstream) = @_;

    (-d destpath($upstream))
	or fatal("generate-manifest: invalid directory: " . $upstream);

    !(-e prefixpath(destpath($upstream), $_))
	or safety_fatal("suspicious file $_ in $upstream")
	foreach(
	    "/sbin/init",
	    "/bin/ls",
	);

    print "# manifest for $upstream\n";
    find(
	{
	    wanted => sub {
		# skip anything that is not a regular file
		-f || return;

		my $f = undestpath($File::Find::name);
		# get the md5 using the whole pathname
		my $md5 = md5file($f);
		# strip off the upstream prefix
		$f = strippath($upstream, $f);

		# it's fairly safe to assume that $f isn't relative;
		# doesn't have "..", ".", or extra slashes; and isn't
		# the empty string.  pass it through check_path()
		# anyway because it could contain whitespace, which
		# would produce an invalid manifest
		check_path($f, \&warning, \&safety_fatal);

		print "$f $md5\n";
	    },
	    no_chdir => 1,
	},
	destpath($upstream));
}

sub import_manifest {
    open_db();
    # read in the manifest
    while (<>) {
	if (/^#/) {
	     next;
	}
	my ($f, $md5) = split(' ');
	check_path($f, exists($db{$f}) ? \&warning : undef);
	# print "FILE $f MD5 $md5\n";

	# if not in db and file exists md5 matches, add it
	if (! $db{$f} && -f destpath($f)) {
	    if ($md5 eq md5file($f)) {
		print("ADDING $f $md5\n");
		db_add_md5($f, $md5);
	    } else {
		print $notice_fh "MISMATCH $f\n";
	    }
	}
    }
}

# For each file there are 20 different update scenarios that must be
# handled:
#
#   #   Before    After     FD        Message
#   ===================================================================
#   1.  (-,-,-)  (-,-,-)
#   2.  (-,-,X)  (X,X,X)  STDOUT      NEW
#   3.  (-,m,-)  (-,m,-)  STDOUT      MANUAL_DELETED;UPSTREAM_WITHDRAWN
#   4.  (-,m,X)  (-,m,X)  STDOUT      MANUAL_DELETED
#   5.  (-,X,-)  (-,X,-)  $notice_fh  MISSING;UPSTREAM_WITHDRAWN
#   6.  (-,X,X)  (-,X,X)  $notice_fh  MISSING
#   7.  (-,X,Y)  (-,X,Y)  $notice_fh  MISSING;UPSTREAM_DIFFERENT
#   8.  (X,-,-)  (X,-,-)
#   9.  (X,-,X)  (X,-,X)  $notice_fh  UNMANAGED_EQ_UPSTREAM
#   10. (X,-,Y)  (X,-,Y)  $notice_fh  UNMANAGED_NEQ_UPSTREAM
#   11. (X,m,-)  (X,m,-)  STDOUT      MANUAL;UPSTREAM_WITHDRAWN
#   12. (X,m,X)  (X,m,X)  STDOUT      MANUAL_EQ_UPSTREAM
#   13. (X,m,Y)  (X,m,Y)  STDOUT      MANUAL
#   14. (X,X,-)  (-,-,-)  STDOUT      DELETED
#   15. (X,X,X)  (X,X,X)
#   16. (X,X,Y)  (Y,Y,Y)  STDOUT      UPDATED
#   17. (X,Y,-)  (X,Y,-)  $notice_fh  MODIFIED;UPSTREAM_WITHDRAWN
#   18. (X,Y,X)  (X,Y,X)  $notice_fh  MODIFIED_EQ_UPSTREAM
#   19. (X,Y,Y)  (X,Y,Y)  $notice_fh  MODIFIED
#   20. (X,Y,Z)  (X,Y,Z)  $notice_fh  MODIFIED;UPSTREAM_DIFFERENT
#
# How to read the above table:
#
#   * Before:  Describes the state of a config file before running
#     'etcmanage --update'.  The first of the triple is the state of
#     the "live" file, the second is the state of the database entry,
#     and the third is the state of the upstream file.
#
#     Possible states:
#       * X, Y, Z:  data with hash X, Y, Z
#       * m:  marked as manually managed (applies to db record only)
#       * -:  doesn't exist (no file or no db record)
#
#   * After:  Same as 'Before', but describes the expected state after
#     'etcmanage --update' is done.
#
#   * FD:  Where the log message for this file should be written.
#
#   * Message:  The string to log (along with the filename) when
#     encountering this scenario.
#
# Logic behind the 'after' states:  If the state of the live file
# matches the state of the database (either neither exist or both
# exist with the same hash), then both the live file and database are
# updated to match the state of the upstream file.  Otherwise nothing
# is modified.
#
# Logic behind the STDOUT/$notice_fh choices:  If --update wants to
# update something (db and/or live file) but can't due to live/db
# mismatch, it goes to $notice_fh.  Otherwise it goes to STDOUT.
#
# If a message is written to $notice_fh, the admin should take one of
# the following actions to keep things "clean":
#   * mark the file as manual maintained
#   * modify the file to match upstream
#   * use --add to update the stored db hash
#
sub db_update {
    my ($upstream) = @_;
    my @to_add = ();
    my @to_update = ();
    my @to_remove = ();
    my @withdrawn = ();
    my $num_manual = 0;

    (-d destpath($upstream))
	or fatal("update: invalid directory: $upstream");

    # Walk over the database and filesystem to collect information
    # first.  Do not take action yet for a few reasons:
    #   * Modifying the db while iterating is not safe.  It should be
    #     safe to delete the current item while iterating, but NetBSD
    #     seems to have problems with it.  see:
    #     https://stackoverflow.com/questions/23417074
    #   * Queueing modifications makes it possible to perform sanity
    #     checks that can abort the script before any modifications
    #     are made.
    #   * Mutating actions and their associated log output can be
    #     sorted for added user-friendliness.
    open_db();
    keys(%db); # reset the iterator for safety
    while (my ($fetc, $md5) = each %db) {
	check_path($fetc, \&warning);

	my $fnew = prefixpath($upstream, $fetc);
	my $fetc_md5;
	my $fnew_md5;

	# XXX ignore this symlink as a special case
	if ($fetc eq "/etc/rmt") {
	    next;
	}

	$fetc_md5 = md5file($fetc) if (-f destpath($fetc));
	$fnew_md5 = md5file($fnew) if (-f destpath($fnew));

	++$num_manual if ($md5 eq "manual");
	if (!defined($fnew_md5) && ($md5 ne "manual")) {
	    push(@withdrawn, $fetc);
	}

	if (!defined($fetc_md5)) {
	    # scenarios 3-7: -,[mX],*
	    if ($md5 eq "manual") {
		# scenarios 3-4: (-,m,*)
		if (!defined($fnew_md5)) {
		    # scenario 3: (-,m,-)
		    print "MANUAL_DELETED;UPSTREAM_WITHDRAWN $fetc\n";
		} else {
		    # scenario 4: (-,m,X)
		    print "MANUAL_DELETED $fetc\n";
		}
	    } elsif (!defined($fnew_md5)) {
		# scenario 5: (-,X,-)
		print $notice_fh "MISSING;UPSTREAM_WITHDRAWN $fetc\n";
		# both live and upstream were deleted.  probably the
		# admin did something like cherry-pick an upstream
		# change, so updating the db is arguably the most
		# convenient behavior.  however, to keep the update
		# algorithm easy for the user to understand, the
		# policy is to not touch anything if the state of the
		# live file doesn't match the database.
	    } elsif ($md5 eq $fnew_md5) {
		# scenario 6: (-,X,X)
		print $notice_fh "MISSING $fetc\n";
	    } else {
		# scenario 7: (-,X,Y)
		print $notice_fh "MISSING;UPSTREAM_DIFFERENT $fetc\n";
	    }
	} elsif ($md5 eq "manual") {
	    # scenarios 11-13: (X,m,*)
	    if (!defined($fnew_md5)) {
		# scenario 11: (X,m,-)
		print "MANUAL;UPSTREAM_WITHDRAWN $fetc\n";
	    } elsif ($fetc_md5 eq $fnew_md5) {
		# scenario 12: (X,m,X)
		print "MANUAL_EQ_UPSTREAM $fetc\n";
	    } else {
		# scenario 13: (X,m,Y)
		print "MANUAL $fetc\n";
	    }
	} elsif ($fetc_md5 eq $md5) {
	    # scenarios 14-16: (X,X,*)
	    if (!defined($fnew_md5)) {
		# scenario 14: (X,X,-)
		push(@to_remove, $fetc);
	    } elsif ($fetc_md5 eq $fnew_md5) {
		# scenario 15: (X,X,X)
		#print "SAME $fetc\n";
	    } else {
		# scenario 16: (X,X,Y)
		push(@to_update, [$fetc, $fnew_md5]);
	    }
	} elsif (!defined($fnew_md5)) {
	    # scenario 17: (X,Y,-)
	    print $notice_fh "MODIFIED;UPSTREAM_WITHDRAWN $fetc\n";
	} elsif ($fetc_md5 eq $fnew_md5) {
	    # scenario 18: (X,Y,X)
	    print $notice_fh "MODIFIED_EQ_UPSTREAM $fetc\n";
	    # both live and upstream were modified in the same way.
	    # probably the admin did something like cherry-pick an
	    # upstream change, so updating the db is arguably the most
	    # convenient behavior.  however, to keep the update
	    # algorithm easy for the user to understand, the policy is
	    # to not touch anything if the state of the live file
	    # doesn't match the database.
	} elsif ($md5 eq $fnew_md5) {
	    # scenario 19: (X,Y,Y)
	    print $notice_fh "MODIFIED $fetc\n";
	} else {
	    # scenario 20, (X,Y,Z)
	    print $notice_fh "MODIFIED;UPSTREAM_DIFFERENT $fetc\n";
	}
    }

    # now, walk the new tree, queueing adds for stuff that doesn't
    # conflict
    find(
	{
	    wanted => sub {
		-f || return;

		my $fnew = undestpath($File::Find::name);
		my $fetc = strippath($upstream, $fnew);

		# if the file is in the database it has already been
		# handled by db_update so simply return.  this
		# prevents update from silently re-creating
		# purposely-deleted files.
		return if exists($db{$fetc});

		my $md5 = md5file($fnew);

		if (! -f destpath($fetc)) {
		    # scenario 2: (-,-,X)
		    push(@to_add, [$fetc, $md5]);
		} elsif (md5file($fetc) eq $md5) {
		    # scenario 9: (X,-,X)
		    print $notice_fh "UNMANAGED_EQ_UPSTREAM $fetc\n";
		} else {
		    # scenario 10: (X,-,Y)
		    print $notice_fh "UNMANAGED_NEQ_UPSTREAM $fetc\n";
		}
	    },
	    no_chdir => 1,
	},
	destpath($upstream));

    ## sanity checks
    my $sane = 1;
    my $num_managed = keys(%db) - $num_manual;
    # check for too many new files
    if ((@to_add >= 10) && (@to_add >= $num_managed)) {
	safety_error(
	    "Suspiciously large number of new upstream config files."
	    . "  Was the correct directory specified?",
	    "New upstream config files:",
	    map("    $_->[0]", @to_add));
	$sane = 0;
    }
    # check for too many withdrawn upstream files
    if ((@withdrawn >= 10) && (@withdrawn / $num_managed >= 0.25)) {
	safety_error(
	    "Suspiciously large number of withdrawn upstream config files."
	    . "  Was the correct directory specified?",
	    "Withdrawn upstream config files:",
	    map("    $_", @withdrawn));
	$sane = 0;
    }
    # exit if a sanity check failed
    $sane or safety_fatal("Potentially unsafe situation detected.");

    # perform adds first followed by updates.  adds are least likely
    # to cause problems for the user, so it's more OK for the NEW
    # messages to scroll off the screen.  updates are riskier, but not
    # as risky as deletes.
    foreach (["NEW", \@to_add], ["UPDATED", \@to_update]) {
	my $token = $_->[0];
	foreach (@{$_->[1]}) {
	    my ($fetc, $md5) = @{$_};
	    my $fnew = prefixpath($upstream, $fetc);
	    cp_rfp($fetc, destpath(rootdir()), destpath($upstream));
	    db_add_md5(@{$_});
	    print("$token $fetc\n");
	}
    }

    # perform deletes last.  these are potentially most dangerous, so
    # deleting last gives the user more time to hit Ctrl-C and
    # increases the likelihood the messages will be noticed (less
    # likely to scroll off the screen)
    foreach (@to_remove) {
	print("DELETED $_\n");
	unlink(destpath($_)) or fatal("failed to remove $_: $!");
	db_remove($_);
    }
}


# open the database
my $db_opened = 0;
sub open_db {
    return if ($db_opened);
    make_path(dirname(destpath($dbfile)));
    tie(%db, 'DB_File', destpath($dbfile), O_RDWR|O_CREAT, 0644)
	|| fatal("unable to open database file $dbfile");
    $db_opened = 1;
}

sub close_db {
    return if (!$db_opened);
    untie %db;
    $db_opened = 0;
}

######################################################################
## command-line argument processing

# support for '--add $f1 --md5 $h1 --add $f2 --add $f3 --md5 $h3':
# when --add is processed, undef is pushed onto this array.  when
# --md5 is processed, the last element of this array is replaced with
# the value associated with the --md5 argument.  thus, '--add foo
# --add bar --md5 <hash>' causes @md5_queue to look like:
#     @md5_queue = (undef, "<hash>");
# which results in the following behavior:
#     db_add("foo");
#     db_add_md5("bar", "<hash>");
my @md5_queue = ();

# dispatch table
my %commands = (
    "add=s" => sub {
	my $md5 = shift(@md5_queue);
	if (defined($md5)) {
	    db_add_md5($_[0], $md5);
	} else {
	    db_add($_[0]);
	}
    },
    "check" => \&db_check,
    "clear" => \&db_clear,
    "generate-manifest=s" => \&generate_manifest,
    "import-manifest" => \&import_manifest,
    "list=s" => \&db_list,
    "manual=s" => \&db_manual,
    "print" => \&db_print,
    "remove=s" => \&db_remove,
    "unmanaged" => \&db_unmanaged,
    "update=s" => \&db_update,
    );

# queue commands rather than execute the functions directly because
# the user must specify --md5 after --add
my @cmd_queue = ();
sub enqueue_cmd {
    my ($name, $value) = @_;
    my $closure;
    if (exists($commands{$name})) {
	$closure = sub { &{$commands{$name}}(); };
    } elsif (exists($commands{$name . "=s"})) {
	$closure = sub { &{$commands{$name . "=s"}}($value); };
    } else {
	fatal("unhandled command: $name");
    }
    if ($name eq "add") { push(@md5_queue, undef); }
    push(@cmd_queue, $closure);
}

GetOptions(
    "db=s" => sub {
	# push a closure onto @cmd_queue to ensure that --db only
	# affects subsequent commands
	my $f = $_[1];
	push(@cmd_queue, sub { $dbfile = $f; });
    },
    "destdir=s" => sub {
	my $d = $_[1];
	$d = "" if ($d eq "/");
	($d eq "") || (-d $d) or fatal("$_[0]: invalid directory: $d");
	push(@cmd_queue, sub { close_db(); $DESTDIR = $d; });
    },
    "help|h" => sub { print($usage); exit(0); },
    "md5=s" => sub {
	(@md5_queue > 0) or fatal("--md5 must be specified after --add");
	$md5_queue[$#md5_queue] = $_[1];
    },
    "no-safety-checks" => sub {
	push(@cmd_queue, sub { $safety_checks_enabled = 0; });
    },
    "version|V" => sub {
	print(<<\EOF);
etcmanage 0.9.4
EOF
	exit(0);
    },
    map(($_ => \&enqueue_cmd), keys(%commands)),
    )
    or fatal("error while processing arguments");

@ARGV == 0 or fatal("unsupported argument(s) specified: @ARGV");

# run the enqueued commands
foreach (@cmd_queue) {
    $_->();
}

# flush all changes
close_db();
