#!/usr/pkg/bin/perl -w
# mknmz-wwwoffle
#
#    mknmz-wwwoffle
#
#    mknmz-wwwoffle
#
#
# Copyright (C) 2000,2001,2002 WATANABE Yoshimasa <naney@naney.org>
#
# This file is part of the mknmz-wwwoffle.
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc., 59
# Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
#
# $Id: mknmz-wwwoffle.in,v 1.37 2002/03/14 12:31:17 naney Exp $
#
#

use strict;

use vars qw($VERSION);
$VERSION = '0.7.2';

package MknmzWwwoffle::UriDB;

use strict;

use vars qw($VERSION);
$VERSION = '0.7.2';

BEGIN {
  if (1) {
    eval "use DB_File;";
    die $@ if ($@);
  }
}

sub new {
  my ($class, $fileName) = @_;

  my $DB;
  my %hash;

  unless($DB =
         tie(%hash,
             'DB_File',
             $fileName,
             (O_CREAT | O_RDWR),
             0644)) {
    die "mknmz-wwwoffle: Can not open URI DB file: $fileName";
  }
  my $self = bless {DB     => $DB,
                    DBHash => \%hash}, $class;

  return $self;
}


sub DESTROY {
  my $self = shift;

  if (defined $self->{DB}) {
    util::vprint("Closing URI DB...");
    undef $self->{DB};
    untie %{$self->{DBHash}};
    util::vprint("Done.\n");
  }
}

sub existsFullFileName {
  my ($self, $fileName) = @_;

  return exists $self->{DBHash}->{$fileName};
}

sub uri {
  my ($self, $fileName) = @_;

  return $self->{DBHash}->{$fileName};
}

sub setUri {
  my ($self, $fileName, $uri) = @_;

  util::vprint("Add URI to DB: $uri\n");
  $self->{DBHash}->{$fileName} = $uri;

}

package MknmzWwwoffle::ExcludeRule;

use strict;

sub new {
  my ($class, $rule_file_name) = @_;

  my $self = bless {}, $class;

  $self->read_rule_file($rule_file_name);

  return $self;
}

sub read_rule_file {
  my ($self, $rule_file_name) = @_;

  $self->{Rule_File_Name} = $rule_file_name;

  open(RULE_FILE, $rule_file_name)
    || die "Can't open exclude rule file '$rule_file_name': $!";

  my $line_no = 0;
  for my $line (<RULE_FILE>) {
    $line_no++;
    chomp $line;
    next if $line =~ /^\s*$/;
    next if $line =~ /^\s*#/;

    $line =~ /^\s*(\S+)\s*:\s*(.*)$/;
    my $key   = $1;
    my $value = $2;

    if (!defined $key || ! $key) {
      die "$rule_file_name:$line_no:Invalid rule";
    }

    $value = '' unless defined $value;

    $value =~ s/\s+$//;

    if ($key eq 'uri') {
      if ($value eq '') {
        warn "$rule_file_name:$line_no: No URI";
        next;
      }
      else {
        $self->{Exclude_URI}->{$value}++;
      }
      next;
    }

    if ($key eq 'uri-prefix') {
      if ($value eq '') {
        warn "$rule_file_name:$line_no: No URI prefix";
        next;
      }
      else {
        push(@{$self->{Exclude_URI_Prefix}}, [$value, length($value)]);
      }
      next;
    }
    die "$rule_file_name:$line_no: Unknown line";
  }

  close(RULE_FILE);
}

sub check_uri {
  my ($self, $uri) = @_;

  if (exists $self->{Exclude_URI}->{$uri}) {
    return "uri: $uri";
  }

  for my $uri_prefix (@{$self->{Exclude_URI_Prefix}}) {
    if (substr($uri, 0, $uri_prefix->[1]) eq $uri_prefix->[0]) {
      return "uri-prefix: " . $uri_prefix->[0];
    }
  }

  return '';
}

package MknmzWwwoffle;

use strict;
use Cwd;
use File::Find;
use Getopt::Long;

use vars qw($VERSION);
$VERSION = '0.7.2';

BEGIN {
  unshift @INC, '/usr/pkg/share/namazu/pl';
  require 'gettext.pl' || die;
  require 'var.pl'     || die;
  require 'conf.pl'    || die;
  require 'util.pl'    || die;
}

sub defaultTemporaryDirectory    { '/tmp'; }
sub default_target_list_file_name_prefix
                                 { 'mknmz-wwwoffle-target-list.'; }
sub mknmz                        { '/usr/pkg/bin/mknmz'; }
sub namazuConfDir                { '/usr/pkg/etc/namazu'; }
sub mimeTypesFile                { '/etc/mime.types'; }

sub uriDBFileName                { 'MKNMZ-WWWOFFLE.uri'; }

sub defaultSkipTopLevelMediaType { return qw(image audio video); }
sub defaultSkipSuffix            { return qw(gif jpg class png mid
                                             css exe zip gz lzh mov
                                             eps swf); }

my %Default_skip_status_codes =
  ('302', 1,
   '401', 1,
   '403', 1,
   '404', 1,
   '500', 1,
   '503', 1, # include $reason phrase eq 'WWWOFFLE Remote Host Error'
  );


sub new {
  my $class = shift;

  srand(time);

  my $self = bless {
                    CacheListingDirectory => [],
                    UriDB                 => undef,
                   }, $class;

  for my $topLevelMediaType (defaultSkipTopLevelMediaType()) {
    $self->{SkipTopLevelMediaType}->{$topLevelMediaType} = 1;
  }
  for my $suffix (defaultSkipSuffix()) {
    $self->{SkipSuffix}->{$suffix} = 1;
  }

  $self->get_options();
  $self->loadRcfiles();
  $self->addSkipSuffixFromMimeTypesFile(mimeTypesFile());

  return $self;
}

sub DESTROY {
  my $self = shift;
  if (defined $self->{UriDB}) {
    undef $self->{UriDB};
  }
  $self->unlinkListingFile();
}

my @mknmzOptionDefinition =
  (
   ['0|help',               undef,                      0],
   ['1|exclude=s',          \$conf::EXCLUDE_PATH,       1],
   ['2|deny=s',             \$conf::DENY_FILE,          1],
   ['3|allow=s',            \$conf::ALLOW_FILE,         0],
   ['4|update=s',           undef,                      1],
   ['5|mhonarc',            undef,                      1],
   ['6|mtime=s',            \$var::Opt{'mtime'},        1],
   ['7|html-split',         \$var::Opt{'htmlsplit'},    1],
   ['C|show-config',        undef,                      1],
   ['E|no-edge-symbol',     \$var::Opt{'noedgesymbol'}, 1],
   ['F|target-list=s',      undef,                      0],
   ['G|no-okurigana',       \$var::Opt{'okurigana'},    1],
   ['H|no-hiragana',        \$var::Opt{'hiragana'},     1],
   ['I|include=s',          undef,                      1],
   ['K|no-symbol',          \$var::Opt{'nosymbol'},     1],
   ['M|meta',               \$var::Opt{'meta'},         1],
   ['O|output-dir=s',       undef,                      1],
   ['S|checkpoint-sub',     undef,                      1],
   ['T|template-dir=s',     undef,                      1],
   ['U|no-encode-uri',      \$var::Opt{'noencodeuri'},  0],
   ['V|verbose',            \$var::Opt{'verbose'},      1],
   ['Y|no-delete',          \$var::Opt{'nodelete'},     1],
   ['Z|no-update',          \$var::Opt{'noupdate'},     1],
   ['a|all',                undef,                      1],
   ['c|use-chasen',         undef,                      1],
   ['d|debug',              \$var::Opt{'debug'},        1],
   ['e|robots',             \$var::Opt{'robotexclude'}, 1],
   ['f|config=s',           undef,                      1],
   ['h|mailnews',           undef,                      1],
   ['k|use-kakasi',         undef,                      1],
   ['m|use-chasen-noun',    undef,                      1],
   ['q|quiet',              undef,                      1],
   ['r|replace=s',          undef,                      0],
   ['s|checkpoint',         undef,                      1],
   ['t|media-type=s',       undef,                      1],
   ['u|uuencode',           undef,                      1],
   ['v|version',            undef,                      0],
   ['x|no-heading-summary', undef,                      1],
   # for mknmz-wwwoffle
   ['list-only',            undef,                      0],
   ['exclude-rule=s',       undef,                      0],
   ['random-update=f',      undef,                      0],
   ['random-host=f',        undef,                      0],
   );

sub list_only {
  my $self = shift;

  return $self->{Options}{'list-only'}
    if defined $self->{Options}{'list-only'};
  return 0;
}

sub get_options {
  my $self = shift;

  Getopt::Long::config('bundling'); # like mknmz

  my %optionSpecification;
  for my $def (@mknmzOptionDefinition) {
    $optionSpecification{$def->[0]} = \$self->{Options}->{$def->[0]};
  }
  GetOptions(%optionSpecification) || die;

  for my $def (@mknmzOptionDefinition) {
    if (defined $self->{Options}->{$def->[0]} &&
        ref($def->[1])) {
     ${$def->[1]} = $self->{Options}->{$def->[0]};
    }
  }

  $self->outputDirectory($self->{Options}{'O|output-dir=s'})
    if defined $self->{Options}{'O|output-dir=s'};

  $self->target_list_file_name($self->{Options}{'F|target-list=s'})
    if defined $self->{Options}{'F|target-list=s'};

  if (defined $self->{Options}{'random-update=f'}) {
    $self->set_no_delete_option;
    $self->{'random-update-ratio'} = $self->{Options}{'random-update=f'};
  }
  else {
    $self->{'random-update-ratio'} = 100; # > 1
  }

  if (defined $self->{Options}{'random-host=f'}) {
    $self->set_no_delete_option;
    $self->{'random-host-ratio'} = $self->{Options}{'random-host=f'};
  }
  else {
    $self->{'random-host-ratio'} = 100; # > 1
  }
}

sub set_no_delete_option {
  my $self = shift;

  $self->{Options}{'Y|no-delete'} = 1;
  $var::Opt{'nodelete'} = 1;
}

sub loadRcfiles {
  my $self = shift;

  my @rcfileList = (namazuConfDir() . "/mknmzrc", "$ENV{'HOME'}/.mknmzrc");
  if (defined $self->{Options}{'f|config=s'}) {
    push (@rcfileList, $self->{Options}{'f|config=s'});
  }

  util::vprint(_("Reading rcfile: "));
  for my $rcfile (@rcfileList) {
    if (-f $rcfile) {
      $self->loadRcfile($rcfile);
      util::vprint(" $rcfile");
    }
  }
  util::vprint("\n");
}

sub loadRcfile {
  my ($self, $rcfile) = @_;

  do $rcfile;
}

sub run {
  my $self = shift;

  if (defined $self->{Options}->{'0|help'}) {
    $self->printHelp();
    exit 0;
  }

  if (defined $self->{Options}->{'v|version'}) {
    $self->printVersion();
    exit 0;
  }

  if (defined $self->{Options}->{'exclude-rule=s'}) {
    my $rule = MknmzWwwoffle::ExcludeRule
      ->new($self->{Options}->{'exclude-rule=s'});
    $self->{ExcludeRule} = $rule;
  }
  else {
    $self->{ExcludeRule} = undef;
  }

  @{$self->{Targets}} = @ARGV;

  $self->listingTargets();
  unless($self->list_only) {
    $self->callMknmz();
  }
  $self->unlinkListingFile();
}

sub unlinkListingFile {
  my $self = shift;

  return if $self->list_only;
  return if $self->{Options}{'F|target-list=s'};

  if (-e $self->target_list_file_name()) {
    unlink($self->target_list_file_name());
  }
}

sub listingTargets {
  my $self = shift;

  my $listingFile = $self->target_list_file_name;
  unlink($listingFile);
  open(LISTING, ">$listingFile") ||
    die "Can not open listing file: $listingFile";

  if (1) {
    $self->{UriDB} = MknmzWwwoffle::UriDB->new($self->uriDBFullFileName());
  }

  my $begin_time = time;
  if ($self->{'random-host-ratio'} >= 1) {
    for my $target (@{$self->{Targets}}) {
      find(sub { $self->list_up_file($File::Find::dir, $_)}, $target);
    }
  }
  else {
    for my $target (@{$self->{Targets}}) {
      opendir(TARGETDIR, $target) || warn "Can't opendir $target: $!";
      my @files = grep { $_ ne '.' && $_ ne '..' } readdir(TARGETDIR);
      close(TARGETDIR);
      for my $host (@files) {
        next unless -d "$target/$host";
        if ($self->{'random-host-ratio'} < rand) {
          util::vprint("Skip: $target/$host");
          util::vprint("      (random)");
          next;
        }
        find(sub { $self->list_up_file($File::Find::dir, $_)}, "$target/$host");
      }
    }
  }
  my $end_time = time;

  $self->{listing_time} = $end_time - $begin_time;

  if (defined $self->{UriDB}) {
    undef $self->{UriDB};
  }
  close(LISTING);
}


sub list_up_file {
  my ($self, $dir ,$fileName) = @_;

  unless ($fileName =~ /^U/) {
    return;
  }

  $self->{number_of_files}++;

  if ($self->{'random-update-ratio'} < 1 &&
      $self->{'random-update-ratio'} > rand) {
    util::vprint("Skip: $dir/$fileName");
    util::vprint("      (random)");
    return;
  }

  my $uri = $self->uri_file_name_to_uri($fileName);
  return unless $uri;

  if (defined $self->{ExcludeRule}) {
    my $skip = $self->{ExcludeRule}->check_uri($uri);
    if ($skip) {
      util::vprint("Skip: $uri");
      util::vprint("      ($skip)");
      return;
    }
  }
  $uri =~ m|\.([^\.]+)$|;
  if (defined $1 && exists $self->{SkipSuffix}->{lc($1)}) {
    util::vprint("Skip: $uri\n");
    return;
  }

  my $dataFileName = $fileName;
  $dataFileName =~ s/^U/D/;

  unless (open(DATA, $dataFileName)) {
    warn "Can not open data file: $dir/$dataFileName";
    return;
  }

  my $responseStatusLine = <DATA>;
  my $skip = $self->check_response_status_line($responseStatusLine);
  unless ($skip) {
    #for my $line (<DATA>) {
    my $line;
    while (defined ($line = <DATA>)) {
      last if $line =~ /^\015\012$/;
      if ($line =~ m|^Content-[Tt]ype: ([^/]+)/.*\015$|) {
        if (exists $self->{SkipTopLevelMediaType}{$1}) {
          $skip = "Top Level Media Type is $1";
          last;
        }
      }
      if ($line =~ /^Location:/) {
        $skip = "Location Field";
        last;
      }
    }
  }
  close(DATA);
  if ($skip) {
    util::vprint("Skip: $uri $dir/$dataFileName");
    util::vprint("      ($skip)");
    return;
  }
  print LISTING "$dir/$dataFileName\n";
  util::vprint("Listing: $uri $dir/$dataFileName\n");
}

sub uri_file_name_to_uri {
  my ($self, $uri_file_name) = @_;

  if (defined $self->{UriDB}) {
    if ($self->{UriDB}->existsFullFileName($uri_file_name)) {
      return $self->{UriDB}->uri($uri_file_name);
    }
    else {
      unless(open(URI, $uri_file_name)) {
        warn "Can't open URI file '$uri_file_name': $!";
        return '';
      }
      my $uri = <URI>;
      close(URI);
      $self->{UriDB}->setUri($uri_file_name, $uri);
      return $uri;
    }
  }
  else {
    unless(open(URI, $uri_file_name)) {
      warn "Can't open URI file '$uri_file_name': $!";
      return '';
    }
    my $uri = <URI>;
    close(URI);
    return $uri;
  }
}

sub check_response_status_line {
  my ($self, $response_status_line) = @_;

  return 0 unless defined $response_status_line && $response_status_line ne '';

  my $status_code;
  my $reason_phrase;

  if (($status_code, $reason_phrase) = ($response_status_line =~ m|^HTTP/\d+\.\d+ (\d\d\d) (.*)$|)) {
    if (exists $Default_skip_status_codes{$status_code}) {
      return "Status Code $status_code";
    }
    return 0;
  }
  else {
    return 0;
  }
}


sub addSkipSuffixFromMimeTypesFile {
  my ($self, $fileName) = @_;

  return  if (! -e $fileName);
  open (MIMETYPE, $fileName) || die;
  while (<MIMETYPE>) {
    next if /^\s*#/;
    next if /^\s*$/;
    m|^([^/]+)/\S+\s+(.*)$|;
    if (exists $self->{SkipTopLevelMediaType}{$1}) {
      if (defined $2) {
        my @suffixList = split(/\s+/, $2);
        map {$self->{SkipSuffix}{lc($_)} = 1} @suffixList;
      }
    }
  }
  close(MIMETYPE);
}

sub temporaryDirectory {
  my ($self, $directoryName) = @_;

  if (defined $directoryName) {
    return $self->{TemporayDirectory} = $directoryName;
  }
  else {
    if (exists $self->{TemporayDirectory}) {
      return $self->{TemporaryDirectory};
    }
    else {
      for my $environmentVariable ('TMPDIR', 'TMP', 'TEMP') {
        if (defined $ENV{$environmentVariable} &&
            $ENV{$environmentVariable}) {
          return $self->{TemporaryDirectory} = $ENV{$environmentVariable};
        }
      }
      return $self->{TemporaryDirectory} = defaultTemporaryDirectory();
    }
  }
}

sub target_list_file_name {
  my ($self, $file_name) = @_;

  if (defined $file_name) {
    return $self->{Target_List_File_Name} = $file_name;
  }
  else {
    if (exists $self->{Target_List_File_Name}) {
      return $self->{Target_List_File_Name};
    }
    else {
      return $self->{Target_List_File_Name}
        = $self->temporaryDirectory() . '/' . default_target_list_file_name_prefix()
          . $$;
    }
  }
}

sub callMknmz {
  my $self = shift;
  my $listingFile = $self->target_list_file_name();

  my @options;
  for my $def (@mknmzOptionDefinition) {
    if (defined $self->{Options}->{$def->[0]} && $def->[2]) {
      my $name = $def->[0];
      $name =~ s/^[^|]+\|//;
      if ($name =~ /^(.*)=s$/) {
        push(@options, "--$1=" . $self->{Options}->{$def->[0]});
      }
      else {
        push(@options, "--$name");
      }
    }
  }

  if (1) {
    $ENV{MKNMZ_WWWOFFLE_URI_DB} = $self->uriDBFullFileName();
  }

  my $callMknmz = mknmz() . " --allow='D.*\$' --replace='wwwofflecache::replaceCode' --no-encode-uri --target-list=$listingFile " . join(' ', @options);
  util::vprint("$callMknmz\n");
  system($callMknmz);
}

sub outputDirectory {
  my ($self, $dir) = @_;

  if (defined $dir) {
    return $self->{OutputDirectory} = $dir;
  }
  else {
    if (defined $self->{OutputDirectory}) {
      return $self->{OutputDirectory};
    }
    else {
      return $self->{OutputDirectory} = cwd();
    }
  }
}

sub uriDBFullFileName {
  my $self = shift;

  return $self->outputDirectory() . '/' . uriDBFileName();
}

sub printVersion {
  print <<__ENDSTR__;
mknmz-wwwoffle $VERSION
Written by WATANABE Yoshimasa

Copyright (C) 2000,2001,2002 WATANABE Yoshimasa
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
__ENDSTR__
}

sub printHelp {
  print <<__ENDSTR__;
mknmz-wwwoffle $VERSION

Usage: mknmz-wwwoffle [options] target...
Options:
mknmz-wwwoffle Options: [defaults in brackets after descriptions]
  --help                    print this message
  --version                 print the version of mknmz-wwwoffle

  --output-dir=DIR          set DIR to output the INDEX [.]
  --target-list=FILE
  --list-only               list up target files and stop.
  --exclude-rule=FILE       set exclude rule file.
  --random-update=NUM
  --random-host=NUM
  --verbose                 be verbose.
  --debug                   be debug mode.

mknmz Options: These options are passed to mknmz.
  --exclude=PATTERN         see mknmz manual.
  --deny=PATTERN            see mknmz manual.
  --update=INDEX            see mknmz manual.
  --mhonarc                 see mknmz manual.
  --mtime=NUM               see mknmz manual.
  --html-split              see mknmz manual.
  --show-config             see mknmz manual.
  --no-edge-symbol          see mknmz manual.
  --no-okurigana            see mknmz manual.
  --no-hiragana             see mknmz manual.
  --include=FILE            see mknmz manual.
  --no-symbol               see mknmz manual.
  --meta                    see mknmz manual.
  --checkpoint-sub          see mknmz manual.
  --template-dir=DIR        see mknmz manual.
  --no-encode-uri           see mknmz manual.
  --no-delete               see mknmz manual.
  --no-update               see mknmz manual.
  --all                     see mknmz manual.
  --use-chasen              see mknmz manual.
  --robots                  see mknmz manual.
  --config=FILE             see mknmz manual.
  --mailnews                see mknmz manual.
  --use-kakasi              see mknmz manual.
  --use-chasen-noun         see mknmz manual.
  --quiet                   see mknmz manual.
  --checkpoint              see mknmz manual.
  --media-type=MTYPE        see mknmz manual.
  --uuencode                see mknmz manual.
  --no-heading-summary      see mknmz manual.
__ENDSTR__
}

package main;
use Benchmark;

sub main {
  my $start_time = new Benchmark;
  my $app = MknmzWwwoffle->new();
  $app->run();
  my $end_time = new Benchmark;
  my $diff_time = timediff($end_time, $start_time);
  #print "$0\n";
  #print "Total Files: ", $app->{number_of_files}, "\n";
  #print  "Listing Time    : ", $app->{listing_time}, "s\n";
  #printf("Listing File/Sec: %f\n", $app->{number_of_files} / $app->{listing_time});
  #printf("File/Sec: %f\n", $app->{number_of_files} / ${@$diff_time}[0]);
  #print timestr($diff_time), "\n";

  return 0;
}

main();

# for emacs
# Local Variables:
# mode: cperl
# End:
