package CharacterSet::SJIS;
#
#    SJIS.pm
#
#    $Id: SJIS.pm,v 1.2 2001/02/11 03:27:52 bstell Exp $
#
#    Copyright (c) 2000 Brian Stell
#
#    This package is free software and is provided ``as is'' without
#    express or implied warranty. It may be used, redistributed and/or
#    modified under the terms of the Perl Artistic License 
#    (see http://www.perl.com/perl/misc/Artistic.html)
#

use strict;
use utf8;
use Carp;
use Exporter;
use ICU::Unicode qw(:EUnicodeEnums :EUnicodeScriptGenericTypes
                    :EUnicodeScript :EDirectionProperty :ECellWidths);
use ICU::UConverter;

use vars qw(@ISA %EXPORT_TAGS
            $DEBUG_ismbcalnum 
            $DEBUG_ismbckata
            $DEBUG_ismbcl0
            $DEBUG_ismbcl1
            $DEBUG_ismbcl2
            $DEBUG_isSingle
            $DEBUG_isMultiKanji
            $DEBUG_isDoubleKatakana
            );
@ISA = qw(Exporter);

%EXPORT_TAGS = (
    SjisFuncs => [qw( 
                  &_ismbcalnum 
                  &_ismbckata
                  &_ismbcl0
                  &_ismbcl1
                  &_ismbcl2
                  &isSingle
                  &isMultiKanji
                  &isDoubleKatakana
                 )],
);

Exporter::export_tags();

$DEBUG_ismbcalnum = 0;
$DEBUG_ismbckata = 0;
$DEBUG_ismbcl0 = 0;
$DEBUG_ismbcl1 = 0;
$DEBUG_ismbcl2 = 0;
$DEBUG_isSingle = 0;
$DEBUG_isMultiKanji = 0;
$DEBUG_isDoubleKatakana = 0;


sub DESTROY {
    my ($self) = @_;
    $self->{UCONVERTER} = undef;
}

sub isSingle {
    my ($self, $utf8_str) = @_;
    if ($DEBUG_isSingle) { print STDERR "In isSIngle routine \n"; }
    if ((2 != @_) || (!ref($self)))  { confess 'usage: $cs->isSingle($utf8_str)'; return(0); }

    if (!defined($utf8_str)) {
        return(0);
    }
    my @utf8_chars = split('', $utf8_str);
    if ($DEBUG_isSingle) { print STDERR "length of ($utf8_str) is ".scalar(@utf8_chars)."\n"; }
    if (scalar(@utf8_chars) == 0) {
        return(0);
    }
    foreach my $utf8_char (@utf8_chars) {
        $self->{UCONVERTER}->reset();
        my $sjis_char = $self->{UCONVERTER}->ucnv_fromUChars($utf8_char);

        if (!defined($sjis_char)) {
            if ($DEBUG_isSingle) { print STDERR "no data\n"; }
            return(0);
        }

        my ($sjis1,$sjis2,$sjis3) = unpack('C*', $sjis_char);
        # should check if $sjis3 is not undef
        if (defined($sjis2)) {
            if ($DEBUG_isSingle) { print STDERR "more than one byte\n"; }
            return(0);
        }

        if ((0x0000 <= $sjis1) && ($sjis1 <= 0x007F)) {
            if ($DEBUG_isSingle) { printf STDERR "utf8_char ($sjis_char - 0x%04x ) is SingleByteField\n",$sjis1; }
            next;
        }
        if ($DEBUG_isSingle) { printf STDERR "utf8_char ($sjis_char - 0x%04x ) is single byte but not SingleByteField.\n", $sjis1; }
        return(0);
    }
    return(1);
}

sub isDoubleKatakana {
    my ($self, $utf8_str) = @_;
    if ($DEBUG_isDoubleKatakana) { print STDERR "In isDoubleKatakana routine\n"; }
    if ((2 != @_) || (!ref($self)))  { confess 'usage: $cs->isDoubleKatakana($utf8_str)'; return(0); }

    if (!defined($utf8_str)) {
        return(0);
    }
    my @utf8_chars = split('', $utf8_str);
    if ($DEBUG_isDoubleKatakana) { print STDERR "length of ($utf8_str) is ".scalar(@utf8_chars)."\n"; }
    if (scalar(@utf8_chars) == 0) {
        return(0);
    }
    foreach my $utf8_char (@utf8_chars) {
        #if ($DEBUG_isDoubleKatakana) { print STDERR "Checking if $utf8_char is Japanese char\n"; }
        $self->{UCONVERTER}->reset();
        my $sjis_char = $self->{UCONVERTER}->ucnv_fromUChars($utf8_char);

        if (!defined($sjis_char)) {
            if ($DEBUG_isDoubleKatakana) { print STDERR "no data\n"; }
            return(0);
        }

        my ($sjis1,$sjis2,$sjis3) = unpack('C*', $sjis_char);
        # should check if $sjis3 is not undef
        if (!defined($sjis2)) {
            if ($DEBUG_isDoubleKatakana) { print STDERR "only one byte\n"; }
            return(0);
        }

        my $val = ($sjis1<<8) + $sjis2; 
        if ($DEBUG_isDoubleKatakana) { printf STDERR "val = 0x%04x\n", $val; }
        if ((0x8340 <= $val) && ($val <= 0x8396)) {
            if ($DEBUG_isDoubleKatakana) { print STDERR "is kata\n"; }
            next;
        }
        if ($DEBUG_isDoubleKatakana) { print STDERR "is multibyte but not kata\n"; }
        return(0);
    }
    return(1);
}

sub isMultiKanji {
    my ($self, $utf8_str) = @_;
    if ($DEBUG_isMultiKanji) { print STDERR "In MultiKanji routine\n"; }
    if ((2 != @_) || (!ref($self)))  { confess 'usage: $cs->isMultiKanji($utf8_str)'; return(0); }

    if (!defined($utf8_str)) {
        return(0);
    }
    my @utf8_chars = split('', $utf8_str);
    if ($DEBUG_isMultiKanji) { print STDERR "---->>length of ($utf8_str) is ".scalar(@utf8_chars)."\n"; }
    if (scalar(@utf8_chars) == 0) {
        return(0);
    }
    foreach my $utf8_char (@utf8_chars) {
        if ($DEBUG_isMultiKanji) {
            printf(STDERR "Checking %s (0x%04x)\n", $utf8_char, utf8_to_ucs4($utf8_char));
        }
        # Accept the character if it passes the single byte test
        if ($DEBUG_isMultiKanji) { print STDERR "Checking if $utf8_char is singleByte\n"; }
        next if ($self->isSingle($utf8_char));
        if ($DEBUG_isMultiKanji) { print STDERR "$utf8_char is not a single Byte field. Checking is it is Japanese char\n"; }
    
        $self->{UCONVERTER}->reset();
        my $sjis_char = $self->{UCONVERTER}->ucnv_fromUChars($utf8_char);

        if (!defined($sjis_char)) {
            if ($DEBUG_isMultiKanji) { print STDERR "no data\n"; }
            return(0);
        }

        my ($sjis1,$sjis2,$sjis3) = unpack('C*', $sjis_char);
        # should check if $sjis3 is not undef
        if (!defined($sjis2)) {
            if ($DEBUG_isMultiKanji) {
                printf(STDERR "%s (0x%04x): only one byte of Shift-JIS (%02x)\n", 
                                    $utf8_char, utf8_to_ucs4($utf8_char), $sjis1);
            }
            return(0);
        }

        my $val = ($sjis1<<8) + $sjis2; 
        if ( ((0x0000 <= $val) && ($val <= 0x007F)) ||
             ((0x8140 <= $val) && ($val <= 0x889E)) ||
             ((0x889F <= $val) && ($val <= 0x9872)) ||
             ((0x989F <= $val) && ($val <= 0xEA9E))
             ) {
            if ($DEBUG_isMultiKanji) { printf STDERR "val: 0x%04x is multi-kanji\n", $val; }
            next;
        }
        if ($DEBUG_isMultiKanji) { printf STDERR "val: 0x%04x is multibyte but not multi-kanji\n", $val; }
        return(0);
    }
    return(1);
}

sub _ismbcalnum {
    my ($self, $utf8_str) = @_;
    if ((2 != @_) || (!ref($self)))  { confess 'usage: $cs->_ismbcalnum($utf8_str)'; return(0); }

    if (!defined($utf8_str)) {
        return(0);
    }
    my @utf8_chars = split('', $utf8_str);
    if ($DEBUG_ismbcalnum) { print STDERR "length of ($utf8_str) is ".scalar(@utf8_chars)."\n"; }
    if (scalar(@utf8_chars) == 0) {
        return(0);
    }
    foreach my $utf8_char (@utf8_chars) {
        $self->{UCONVERTER}->reset();
        my $sjis_char = $self->{UCONVERTER}->ucnv_fromUChars($utf8_char);

        if (!defined($sjis_char)) {
            if ($DEBUG_ismbcalnum) { print STDERR "no data\n"; }
            return(0);
        }

        my ($sjis1,$sjis2,$sjis3) = unpack('C*', $sjis_char);
        # should check if $sjis3 is not undef
        if (defined($sjis2)) {
            if ($DEBUG_ismbcalnum) { print STDERR "more than one byte\n"; }
            return(0);
        }

        if ($DEBUG_ismbcalnum) { printf STDERR "sjis1 = 0x%04x\n", $sjis1; }
        if (   ((0x0030 <= $sjis1) && ($sjis1 <= 0x0039))
            || ((0x0041 <= $sjis1) && ($sjis1 <= 0x005A))
            || ((0x0061 <= $sjis1) && ($sjis1 <= 0x007A))
            ) {
            if ($DEBUG_ismbcalnum) { printf STDERR "$utf8_char ($sjis_char (0x%04x)) is alnum\n",$sjis1; }
            next;
        }
        if ($DEBUG_ismbcalnum) { print STDERR "is single byte but not alnum\n"; }
        return(0);
    }
    return(1);
}

sub _ismbckata {
    my ($self, $utf8_str) = @_;
    if ((2 != @_) || (!ref($self)))  { confess 'usage: $cs->_ismbckata($utf8_str)'; return(0); }

    if (!defined($utf8_str)) {
        return(0);
    }
    my @utf8_chars = split('', $utf8_str);
    if ($DEBUG_ismbckata) { print STDERR "length of ($utf8_str) is ".scalar(@utf8_chars)."\n"; }
    if (scalar(@utf8_chars) == 0) {
        return(0);
    }
    foreach my $utf8_char (@utf8_chars) {
        $self->{UCONVERTER}->reset();
        my $sjis_char = $self->{UCONVERTER}->ucnv_fromUChars($utf8_char);

        if (!defined($sjis_char)) {
            if ($DEBUG_ismbckata) { print STDERR "no data\n"; }
            return(0);
        }

        my ($sjis1,$sjis2,$sjis3) = unpack('C*', $sjis_char);
        # should check if $sjis3 is not undef
        if (!defined($sjis2)) {
            if ($DEBUG_ismbckata) { print STDERR "only one byte\n"; }
            return(0);
        }

        my $val = ($sjis1<<8) + $sjis2; 
        if ($DEBUG_ismbckata) { printf STDERR "val = 0x%04x\n", $val; }
        if ((0x8340 <= $val) && ($val <= 0x8396)) {
            if ($DEBUG_ismbckata) { print STDERR "is kata\n"; }
            next;
        }
        if ($DEBUG_ismbckata) { print STDERR "is multibyte but not kata\n"; }
        return(0);
    }
    return(1);
}

sub _ismbcl0 {
    my ($self, $utf8_str) = @_;
    if ((2 != @_) || (!ref($self)))  { confess 'usage: $cs->_ismbcl0($utf8_str)'; return(0); }

    if (!defined($utf8_str)) {
        return(0);
    }
    my @utf8_chars = split('', $utf8_str);
    if ($DEBUG_ismbcl0) { print STDERR "length of ($utf8_str) is ".scalar(@utf8_chars)."\n"; }
    if (scalar(@utf8_chars) == 0) {
        return(0);
    }
    foreach my $utf8_char (@utf8_chars) {
        $self->{UCONVERTER}->reset();
        my $sjis_char = $self->{UCONVERTER}->ucnv_fromUChars($utf8_char);

        if (!defined($sjis_char)) {
            if ($DEBUG_ismbcl0) { print STDERR "no data\n"; }
            return(0);
        }

        my ($sjis1,$sjis2,$sjis3) = unpack('C*', $sjis_char);
        # should check if $sjis3 is not undef
        if (!defined($sjis2)) {
            if ($DEBUG_ismbcl0) { print STDERR "only one byte\n"; }
            return(0);
        }

        my $val = ($sjis1<<8) + $sjis2; 
        if ($DEBUG_ismbcl0) { printf STDERR "val = 0x%04x\n", $val; }
        if ((0x8140 <= $val) && ($val <= 0x889E)) {
            if ($DEBUG_ismbcl0) { print STDERR "is L0\n"; }
            next;
        }
        if ($DEBUG_ismbcl0) { print STDERR "is multibyte but not L0\n"; }
        return(0);
    }
    return(1);
}

sub _ismbcl1 {
    my ($self, $utf8_str) = @_;
    if ((2 != @_) || (!ref($self)))  { confess 'usage: $cs->_ismbcl1($utf8_str)'; return(0); }

    if (!defined($utf8_str)) {
        return(0);
    }
    my @utf8_chars = split('', $utf8_str);
    if (scalar(@utf8_chars) == 0) {
        return(0);
    }
    foreach my $utf8_char (@utf8_chars) {
        $self->{UCONVERTER}->reset();
        my $sjis_char = $self->{UCONVERTER}->ucnv_fromUChars($utf8_char);

        if (!defined($sjis_char)) {
            if ($DEBUG_ismbcl1) { print STDERR "no data\n"; }
            return(0);
        }

        my ($sjis1,$sjis2,$sjis3) = unpack('C*', $sjis_char);
        # should check if $sjis3 is not undef
        if (!defined($sjis2)) {
            if ($DEBUG_ismbcl1) { print STDERR "only one byte\n"; }
            return(0);
        }

        my $val = ($sjis1<<8) + $sjis2; 
        if ($DEBUG_ismbcl1) { printf STDERR "val = 0x%04x\n", $val; }
        if ((0x889F <= $val) && ($val <= 0x9872)) {
            if ($DEBUG_ismbcl1) { print STDERR "is L1\n"; }
            next;
        }
        if ($DEBUG_ismbcl1) { print STDERR "is multibyte but not L1\n"; }
        return(0);
    }
    return(1);
}

sub _ismbcl2 {
    my ($self, $utf8_str) = @_;
    if ((2 != @_) || (!ref($self)))  { confess 'usage: $cs->_ismbcl2($utf8_str)'; return(0); }

    if (!defined($utf8_str)) {
        return(0);
    }
    my @utf8_chars = split('', $utf8_str);
    if (scalar(@utf8_chars) == 0) {
        return(0);
    }
    foreach my $utf8_char (@utf8_chars) {
        $self->{UCONVERTER}->reset();
        my $sjis_char = $self->{UCONVERTER}->ucnv_fromUChars($utf8_char);

        if (!defined($sjis_char)) {
            if ($DEBUG_ismbcl2) { print STDERR "no data\n"; }
            return(0);
        }

        my ($sjis1,$sjis2,$sjis3) = unpack('C*', $sjis_char);
        # should check if $sjis3 is not undef
        if (!defined($sjis2)) {
            if ($DEBUG_ismbcl2) { print STDERR "only one byte\n"; }
            return(0);
        }

        my $val = ($sjis1<<8) + $sjis2; 
        if ($DEBUG_ismbcl2) { printf STDERR "val = 0x%04x\n", $val; }
        if ((0x989F <= $val) && ($val <= 0xEA9E)) {
            if ($DEBUG_ismbcl2) { print STDERR "is L2\n"; }
            next;
        }
        if ($DEBUG_ismbcl2) { print STDERR "is multibyte but not L2\n"; }
        return(0);
    }
    return(1);
}

sub new {
    my $proto = shift;
    my $class = ref($proto) || $proto;
    my $self  = {};

    $self->{UCONVERTER} = new ICU::UConverter('SJIS');
    if (!$self->{UCONVERTER}) {
        return(undef);
    }
    bless ($self, $class);
    return $self;
}


sub utf8_to_ucs4 {
    my ($utf8_char) = @_;

    print STDERR "char = ($utf8_char) ".__FILE__.' '.__LINE__."\n";
    my ($byte1,$byte2,$byte3,$byte4) = unpack('C*', $utf8_char);
    printf(STDERR "    = (0x%02x 0x%02x 0x%02x 0x%02x)\n", 
                       $byte1||0, , $byte2||0, , $byte3||0, , $byte4||0, );
    my $val; 
    if ($byte1 <= 0x7f) { # one byte
        $val = $byte1; 
    }
    elsif ($byte1 <= 0xDF) { # two bytes
        $val = (($byte1&0x1F)<<6) + ($byte2&0x3f); 
    }
    elsif ($byte1 <= 0xEF) { # three bytes
        $val = (($byte1&0x0F)<<12) + (($byte2&0x3f)<<6) + ($byte3&0x3f); 
    }
    elsif ($byte1 <= 0xF7) { # four bytes
        $val = (   ($byte1&0x07)<<18) + (($byte2&0x3f)<<12)
                + (($byte3&0x3f)<<6)  +  ($byte4&0x3f); 
    }
    else {
        $val = 0xFFFF; # undefined 
    }
    return($val);
}

1;  # so the require or use succeeds

__END__


=head1 NAME

ICU:SJIS.pm - Perl extension for International Components for Unicode

=head1 SYNOPSIS

  use ICU:SJIS;

=head1 DESCRIPTION

TBD

