#!/usr/pkg/bin/perl -w

=pod

=head1 NAME

tv_remove_some_overlapping - Remove some overlapping programmes from XMLTV data.

=head1 SYNOPSIS

tv_remove_some_overlapping [--help] [--output FILE] [FILE...]

=head1 DESCRIPTION

Read one or more XMLTV files and write a file to standard output
containing the same data, except that some 'magazine' programmes which
seem to contain two or more other programmes are removed.

For example, if 'Schools TV' runs from 10:00 to 12:00, and there are
two programmes 'History' from 10:00 to 11:00 and 'Geography' from
11:00 to 12:00 on the same channel, then 'Schools TV' could be
removed.  A programme is removed only if there are two or more other
programmes which partition its timeslot, which implies that it and
these other programmes must have stop times specified.

To avoid throwing away any real programmes, no programme will be
discarded if it has content data other than title and URL.

Filtering this tool won't remove all overlapping programmes but it
will deal with the 'big magazine programme containing smaller
programmes' data commonly seen from listings sources.

B<--output FILE> write to FILE rather than standard output

=head1 SEE ALSO

L<xmltv(5)>.

=head1 AUTHOR

Ed Avis, ed@membled.com

=cut

use strict;
use XMLTV::Version '$Id: tv_remove_some_overlapping,v 1.7 2015/06/23 20:05:25 knowledgejunkie Exp $ ';
use XMLTV::Date;
use Getopt::Long;
use Date::Manip;

BEGIN {
    if (int(Date::Manip::DateManipVersion) >= 6) {
	Date::Manip::Date_Init("SetDate=now,UTC");
    } else {
	Date::Manip::Date_Init("TZ=UTC");
    }
}

# Use Log::TraceMessages if installed.
BEGIN {
    eval { require Log::TraceMessages };
    if ($@) {
	*t = sub {};
	*d = sub { '' };
    }
    else {
	*t = \&Log::TraceMessages::t;
	*d = \&Log::TraceMessages::d;
	Log::TraceMessages::check_argv();
    }
}

use XMLTV;
use XMLTV::Usage <<END
$0: remove some programmes which seem to be mere containers for others
usage: $0 [--help] [--output FILE] [FILE...]
END
;

# Memoize some subroutines if possible
eval { require Memoize };
unless ($@) {
    foreach (qw/Date_Cmp pd interesting/) {
	Memoize::memoize($_) or die "cannot memoize $_: $!";
    }
}

sub pd( $ );
sub exists_partition( $$$$$ );
sub interesting( $ );
sub should_write( $ );

# Keys of a programme hash which don't indicate any data we're
# especially concerned to preserve.  Poke around inside XMLTV.pm to
# find the list of attributes.
#
my %boring_programme_key = (title => 1);
$boring_programme_key{$_} = 1
  foreach map { $_->[0] } @XMLTV::Programme_Attributes;
$boring_programme_key{url} = 1; # common in some sources, and boring

my ($opt_help, $opt_output);
GetOptions('help' => \$opt_help, 'output=s' => \$opt_output) or usage(0);
usage(1) if $opt_help;
@ARGV = ('-') if not @ARGV;

my %w_args = ();
if (defined $opt_output) {
    my $fh = new IO::File ">$opt_output";
    die "cannot write to $opt_output\n" if not $fh;
    %w_args = (OUTPUT => $fh);
}

# Unfortunately we need to load the whole file before processing.  I
# don't want to require the input to be sorted, since tv_sort adds
# guessed stop times which could cause this program to remove too many
# programmes.  This is another reason to eventually change tv_sort not
# to add stop times (move into tv_guess_stop_times or whatever).
#
my ($encoding, $credits, $ch, $progs) = @{XMLTV::parsefiles(@ARGV)};
my $w = new XMLTV::Writer(%w_args, encoding => $encoding);
$w->start($credits);
$w->write_channels($ch);

# Since zero-length and unknown-length programmes are always written
# unchanged, we could write them immediately and discard them.  But
# it's a bit nicer to write the output in the same order as the input.
# However, we don't bother to index these programmes, they cannot /
# should not be used in looking for partitionings.
#
my %by_channel_and_start;
foreach (@$progs) {
    push @{$by_channel_and_start{$_->{channel}}{pd $_->{start}}}, $_
      if interesting $_;
}
$w->write_programme($_) foreach grep { should_write($_) } @$progs;
$w->end();
exit();

# Given that %by_channel_and_start and %boring_programme_key have been
# set up, should a programme (with start and stop time) be written?
#
sub should_write( $ ) {
    my $p = shift;

    # Always write zero length and unknown-length programmes.
    return 1 if not interesting $p;

    # If this programme cannot be partitioned by at least two others,
    # definitely write it.
    #
    return 1
      unless exists_partition(pd $p->{start}, pd $p->{stop},
			      $p->{channel},
			      2, { $p => 1 });

    foreach (keys %$p) {
	if (not $boring_programme_key{$_}) {
	    warn <<END
not filtering programme at $p->{start} on $p->{channel} because it has $_
END
  ;
	    return 1;
	}
    }

    return 0;
}

# We process only programmes with stop time and nonzero length.
sub interesting( $ ) {
    my $p = shift;
    my $stop = $p->{stop};
    return 0 if not defined $stop;
    my $cmp = Date_Cmp(pd $p->{start}, pd $stop);
    if ($cmp < 0) {
	# start < stop, okay.
	return 1;
    }
    elsif ($cmp == 0) {
	# Zero length, won't consider.
	return 0;
    }
    elsif ($cmp > 0) {
	warn "programme on $p->{channel} "
	  . "with start time ($p->{start}) "
	    . "before stop time ($stop)\n";
	return 0;
    }
    else { die }
}

# Does there exist a sequence of programmes hopping from $start to
# $stop, where none of the programmes is in $used and the sequence is
# at least $min_length long?
#
# $start and $stop are Date::Manip objects, $used a hash whose keys
# are used programmes and whose values are true.
#
sub exists_partition( $$$$$ ) {
    my ($start, $stop, $channel, $min_length, $used) = @_;
#    local $Log::TraceMessages::On = 1;
    t "seeking at least $min_length $start to $stop on $channel";
    t '... not including: ' . d $used;
    my $cmp = Date_Cmp($start, $stop);
    if ($cmp < 0) {
	t 'start before stop, okay';
	my @poss = grep { not $used->{$_} }
	  @{$by_channel_and_start{$channel}{$start}};
	t 'possible first steps of path: ' . d \@poss;
	--$min_length if $min_length;
	foreach my $p (@poss) {
	    return 1
	      if exists_partition(pd $p->{stop}, $stop, $channel,
				  $min_length, { %$used, $p => 1 });
	}
	t 'no paths found';
	return 0;
    }
    elsif ($cmp == 0) {
	t 'zero length, so path of length zero';
	return not $min_length;
    }
    elsif ($cmp > 0) {
	t 'stop < start, so no path';
	return 0;
    }
    else { die }
}

# Lift parse_date() to handle undef.
sub pd( $ ) {
    for ($_[0]) {
	return undef if not defined;
	return parse_date($_);
    }
}

exit 1;
