package OCRImageConverter;

use BaseMediaConverter;

use strict;
use warnings;
no strict 'refs'; # allow filehandles to be variables and viceversa
no strict 'subs'; # allow barewords (eg STDERR) as function arguments

use gsprintf 'gsprintf';

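# These two variables must not be given initial values here: the BEGIN block
# below sets them at compile time, and an initialiser on these lines would run
# later and overwrite those values.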
our $ocrimage_conversion_available;
our $no_ocrimage_conversion_reason;

# Compile-time set-up.
#
# Declares BaseMediaConverter as the parent class and then works out whether
# OCR conversion can be offered, recording the answer in
# $ocrimage_conversion_available and, on failure, a reason key in
# $no_ocrimage_conversion_reason.  The checks are:
#   1. the GEXTOCR environment variable is defined;
#   2. the tesseract executable exists under it and "tesseract -v" exits 0;
#   3. the bundled CPAN library directory exists, in which case it is added
#      to @INC before the 'use HTML::TokeParser' that follows this block.
BEGIN {
    @OCRImageConverter::ISA = ('BaseMediaConverter');

    # Assume conversion is possible until one of the checks below fails
    $ocrimage_conversion_available = 1;

    if (!defined $ENV{'GEXTOCR'}) {
        $ocrimage_conversion_available = 0;
        $no_ocrimage_conversion_reason = "gextocrnotinstalled";
    } else {
        my $gextocr_home = $ENV{'GEXTOCR'};
        my $tesseract = &FileUtils::filenameConcatenate($gextocr_home, "installed", "cmdline", "bin", "tesseract");

        if (!-e $tesseract) {
            &gsprintf(STDERR, "**** Failed to find $tesseract\n");
            $ocrimage_conversion_available = 0;
            $no_ocrimage_conversion_reason = "gexttesseractnotinstalled";
        } else {
            my $cmd = "\"$tesseract\" -v ";

            # GSDLOS may be unset; only treat the platform as Windows when it
            # is defined and says so (avoids an uninitialised-value warning).
            my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows/i);

            # Discard the version banner.  stdout is redirected first and
            # stderr is then pointed at it; the reverse order leaves output
            # on the screen on some systems.
            if ($on_windows) {
                $cmd .= " >nul 2>&1";
            } else {
                $cmd .= " >/dev/null 2>&1";
            }

            my $status = system($cmd);

            if ($status != 0) {
                my $error_message =  "**** Testing for Tesseract\n";
                $error_message .= "Failed to run: $cmd\n";
                $error_message .= "Error variable: |$!| and status: $status\n";

                &gsprintf(STDERR, "OCRImageConverter: $error_message");

                $ocrimage_conversion_available = 0;
                $no_ocrimage_conversion_reason = "couldnotruntesseract";
            }
        }

        # The CPAN directory is checked even when the tesseract checks failed,
        # so its reason key replaces any reason recorded above.
        my $cpan = &FileUtils::filenameConcatenate($gextocr_home, "installed", "cmdline", "cpan", "lib64", "perl5");
        &gsprintf(STDERR, "OCRImageConverter: CPAN directory $cpan\n");
        if (! -d $cpan) {
            $ocrimage_conversion_available = 0;
            $no_ocrimage_conversion_reason = "gexthtmltokenotinstalled";
        } else {
            push(@INC, $cpan);
        }
    }
}

use HTML::TokeParser;

# Permitted values for the 'unpaper_layout' enum option; each entry pairs the
# value with the key of its description string.
my $layout_list = [
    { name => 'none',   desc => '{OCRImageConverter.unpaper_layout.none}'   },
    { name => 'single', desc => '{OCRImageConverter.unpaper_layout.single}' },
    { name => 'double', desc => '{OCRImageConverter.unpaper_layout.double}' },
];

# Options this converter contributes; new() pushes these onto the caller's
# "ArgList".
my $arguments = [
    # Flag (off by default): when set, convert() runs unpaper on the image
    # it produced from a GIF source.
    {
        name => 'use_unpaper',
        desc => '{OCRImageConverter.use_unpaper}',
        type => 'flag',
        deft => 0,
        reqd => 'no',
    },
    # Value handed to unpaper's --layout switch.
    {
        name => 'unpaper_layout',
        desc => '{OCRImageConverter.unpaper_layout}',
        type => 'enum',
        reqd => 'yes',
        list => $layout_list,
        deft => 'none',
    },
];

# Extra option that new() adds to the argument list only when the GEXTDIVA
# environment variable is defined.
my $opt_diva_args = [
    {
        name => 'diva_support',
        desc => '{OCRImageConverter.diva_support}',
        type => 'flag',
        deft => 'yes',
        reqd => 'no',
    },
];

# Description record for this converter; new() pushes it onto the caller's
# "OptList".  'args' shares the $arguments array, so anything appended to that
# array is visible here too.
my $options = {
    name     => 'OCRImageConverter',
    desc     => '{OCRImageConverter.desc}',
    abstract => 'yes',
    inherits => 'yes',
    args     => $arguments,
};

# Constructor.
#
# Parameters ($pluginlist, $inputargs, $hashArgOptLists, $auxilary) are passed
# straight through to BaseMediaConverter->new after this class has added its
# own name to $pluginlist and its argument/option descriptions to
# $hashArgOptLists.
#
# Returns the object blessed into $class.  Unless the object is 'info_only',
# it carries:
#   ocrimage_conversion_available  - result of the compile-time checks
#   ocrimage_launch_cmd            - tesseract command prefix (when available)
#   no_ocrimage_conversion_reason  - reason key (when not available)
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists, $auxilary) = @_;
    push(@$pluginlist, $class);

    if (defined $ENV{'GEXTDIVA'}) {
        # $arguments is shared by every instance (and by $options->{'args'}),
        # so only add each Diva option the first time round; otherwise every
        # construction would append another copy.
        foreach my $diva_arg (@$opt_diva_args) {
            my $already_listed = grep { $_->{'name'} eq $diva_arg->{'name'} } @$arguments;
            push(@$arguments, $diva_arg) unless $already_listed;
        }
    }

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Arrow call rather than indirect-object syntax, which is ambiguous to
    # the parser.
    my $self = BaseMediaConverter->new($pluginlist, $inputargs, $hashArgOptLists, $auxilary);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    if ($ocrimage_conversion_available) {
        my $gextocr_home = $ENV{'GEXTOCR'};
        my $tesseract = &FileUtils::filenameConcatenate($gextocr_home, "installed", "cmdline", "bin", "tesseract");
        my $lang = "eng"; # TODO
        my $launch_cmd = "\"$tesseract\" -l $lang ";

        $self->{'ocrimage_launch_cmd'} = $launch_cmd;
    } else {
        $self->{'no_ocrimage_conversion_reason'} = $no_ocrimage_conversion_reason;

        # Tell the user why OCR is switched off
        my $outhandle = $self->{'outhandle'};
        &gsprintf($outhandle, "OCRImageConverter: {OCRImageConverter.noconversionavailable} ({OCRImageConverter.$no_ocrimage_conversion_reason})\n");
    }

    $self->{'ocrimage_conversion_available'} = $ocrimage_conversion_available;

    return bless $self, $class;
}

# Start with an empty list of temporary files for this object.
#
# Called as $self->init($verbosity, $outhandle, $failhandle); the three
# arguments are accepted but not used here.  Returns nothing.
sub init {
    my $self = shift(@_);

    # Must be an array reference: assigning () to a scalar slot stores undef.
    $self->{'ocrtmp_file_paths'} = [];

    return;
}

# Tear-down hook: removes the temporary files recorded on this object by
# delegating to clean_up_temporary_files().
sub deinit {
    my ($self) = @_;

    return $self->clean_up_temporary_files();
}

# Escape $text so that it can be placed between double quotes in a JSON
# document.  An undefined value is treated as the empty string.
sub _ocr_json_escape {
    my ($text) = @_;
    return "" unless defined $text;

    $text =~ s/(["\\])/\\$1/g;
    $text =~ s/\n/\\n/g;
    $text =~ s/\r/\\r/g;
    $text =~ s/\t/\\t/g;
    # JSON forbids raw control characters inside strings
    $text =~ s/([\x00-\x1f])/sprintf("\\u%04x", ord($1))/ge;

    return $text;
}

# Read the hOCR file $hocr and return a JSON array (as a string) holding one
# ["text",[n0,n1,n2,n3]] entry per recognised <span>, where the numbers are
# the first four integers found in the span's title attribute.
# Dies if the file cannot be opened.
sub _ocr_hocr_to_json {
    my ($hocr) = @_;

    open(my $fh, '<', $hocr) || die "Can't open HOCR file $hocr!";
    my $parser = HTML::TokeParser->new($fh);

    my @entries = ();

    # Small state machine: idle -> inSpan (saw <span>) -> gotText (saw the
    # text that follows it) -> idle again at the next end tag.
    my $state = "idle";
    my $bbox = "";
    my $text = "";

    while (my $token = $parser->get_token()) {
        my $type = $token->[0];

        if ($state eq "idle" && $type eq "S" && $token->[1] eq "span") {
            $state = "inSpan";
            $bbox = $token->[2]{'title'};
        }
        elsif ($state eq "inSpan" && $type eq "T") {
            $text = $token->[1];
            $state = "gotText";
            chomp $text;
        }
        elsif ($state eq "gotText" && $type eq "E") {
            $state = "idle";

            my @numbers = defined $bbox ? ($bbox =~ /(\d+)/g) : ();
            # Without four coordinates the entry cannot be written as valid
            # JSON, so leave it out.
            next if (scalar(@numbers) < 4);

            push(@entries, "[\"" . _ocr_json_escape($text) . "\",[" . join(",", @numbers[0 .. 3]) . "]]");
        }
    }

    close($fh);

    # join() copes with zero entries, giving "[]"
    return "[" . join(",", @entries) . "]";
}

# Run OCR over one image.
#
# Parameter:
#   $source_file_full_path - path of the image to process.  A ".gif" source is
#                            first converted to a temporary tif (or pnm, and
#                            passed through unpaper, when 'use_unpaper' is
#                            set) and that file is processed instead.
#
# Returns 0 when OCR is not available or the image file does not exist;
# otherwise the list ($ok, $result, $target_file_path), where $ok is 0 if the
# first tesseract run reported an error and 1 if it did not.
#
# Side effects: runs tesseract twice (the second time with " hocr" appended),
# writes "<base>.hocr.json" next to the hOCR output, and records the temporary
# files it creates in $self->{'ocrtmp_file_paths'}.
# Dies if the hOCR output cannot be opened.
sub convert {
    my $self = shift(@_);
    my ($source_file_full_path) = @_;

    # Nothing below is useful without OCR or without a source file, so check
    # before spending time on the GIF pre-conversion.
    return 0 unless $ocrimage_conversion_available;
    return 0 unless (defined $source_file_full_path && -f $source_file_full_path);

    # TODO cache
    if ($source_file_full_path =~ /\.gif\z/i) {
        # need to convert to another format first
        my $desired_extension = $self->{'use_unpaper'} ? "pnm" : "tif";

        my $converted_file = &util::get_tmp_filename($desired_extension);
        push(@{$self->{'ocrtmp_file_paths'}}, $converted_file);

        my $imagick_cmd = "\"" . &util::get_perl_exec() . "\" -S gs-magick.pl convert \"$source_file_full_path\" \"$converted_file\"";
        system($imagick_cmd);

        print STDERR "*** Magick command: $imagick_cmd\n";

        if ($self->{'use_unpaper'}) {
            # unpaper rewrites the converted file in place
            my $unpaper_cmd = "unpaper \"$converted_file\" \"$converted_file\" --overwrite --layout " . $self->{'unpaper_layout'};
            system($unpaper_cmd);
            print STDERR "*** unpaper command: $unpaper_cmd\n";
        }

        $source_file_full_path = $converted_file;
        print STDERR "*** Converted file: $converted_file\n";

        # The pre-conversion may have failed to produce anything
        return 0 if (!-f $source_file_full_path);
    }

    my $source_file_no_path = &File::Basename::basename($source_file_full_path);

    my $target_file_path;

    if ($self->{'enable_cache'}) {
        $self->init_cache_for_file($source_file_full_path);
        my $cache_dir = $self->{'cached_dir'};
        my $file_root = $self->{'cached_file_root'};

        # NOTE(review): no extension is added here, yet four characters are
        # stripped from this path below - confirm the cached file root is
        # expected to end in a four-character suffix such as ".txt".
        $target_file_path = &FileUtils::filenameConcatenate($cache_dir, $file_root);
    } else {
        $target_file_path = &util::get_tmp_filename("txt");
        push(@{$self->{'ocrtmp_file_paths'}}, $target_file_path);
    }

    # The command is given the target path minus its last four characters
    # (presumably the ".txt" extension); the hOCR output is read back from
    # that same base with ".hocr" appended.
    my $output_base = substr($target_file_path, 0, length($target_file_path) - 4);

    my $convert_cmd = $self->{'ocrimage_launch_cmd'};
    $convert_cmd .= " \"$source_file_full_path\" \"$output_base\"";

    #&gsprintf(STDERR, "OCRImageConverter convert command: $convert_cmd\n");
    print STDERR "OCRImageConverter convert command: $convert_cmd\n";

    my $print_info = {
        'message_prefix' => "OCR Conversion",
        'message' => "Converting $source_file_no_path."
    };

    # First run produces the text; its outcome decides the return value.
    my ($regenerated, $result, $had_error) = $self->autorun_general_cmd($convert_cmd, $source_file_full_path, $target_file_path, $print_info);
    # Second run produces the hOCR output; its outcome is not inspected.
    $self->autorun_general_cmd($convert_cmd . " hocr", $source_file_full_path, $target_file_path, $print_info);

    # HOCR
    my $hocr = $output_base . ".hocr";
    my $json = _ocr_hocr_to_json($hocr);

    my $json_file = $hocr . '.json';
    if (open(my $oh, '>', $json_file)) {
        print $oh "$json";
        if (!close($oh)) {
            print STDERR "OCRImageConverter: failed to finish writing $json_file: $!\n";
        }
    } else {
        # Keep going: the text conversion result is still valid
        print STDERR "OCRImageConverter: failed to open $json_file for writing: $!\n";
    }

    # Diva.js
    #if (defined $EXT{'GEXTDIVA'}) {
    #    if ($self->{'diva_support'}) {
    #        my $webroot = &FileUtils::filenameConcatenate($ENV{'GSDL3HOME'}, '..', 'packages', 'tomcat', 'webapps', 'iipsrv', 'collect', '<HASH>');
    #        my $processpy = &FileUtils::filenameConcatenate($ENV{'GEXTDIVA_INSTALLED'}, 'cmdline', 'python');
    #        my $DS = &util::get_dirsep();
    #        my $convert = `which convert`;
    #        
    #        $print_info = {
    #            'message_prefix' => "Diva.js processing",
    #            'message' => "Now processing images for use with Diva.js..."
    #        };
    #
    #        my $process_cmd = "python \"$processpy\" \"<IN>\" \"$webroot${DS}img\" \"$webroot${DS}data\" -t tiff -i \"$convert\"";
    #        $self->autorun_general_cmd($process_cmd, "<IN>", $print_info);
    #    }
    #}

    if ($had_error) {
        return (0, $result, $target_file_path);
    }

    return (1, $result, $target_file_path);
}

# Debugging aid: writes a fixed marker line to STDERR so a caller can confirm
# that the method is reachable.
sub test {
    my ($self) = @_;

    print STDERR ("**** test working\n");
}

# Wrapper around convert(): forwards the source path, target file type,
# conversion options and conversion id, followed by the literal marker
# "without_result".  Options and id become "" when they are missing or false.
# Returns whatever convert() returns.
sub convert_without_result {
    my ($self, $source_file_path, $target_file_type, $convert_options, $convert_id) = @_;

    $convert_options = "" unless $convert_options;
    $convert_id      = "" unless $convert_id;

    return $self->convert(
        $source_file_path,
        $target_file_type,
        $convert_options,
        $convert_id,
        "without_result"
    );
}

# Delete every temporary file or directory recorded in
# $self->{'ocrtmp_file_paths'} that still exists, then reset the list to
# empty.  Returns nothing.
sub clean_up_temporary_files {
    my $self = shift(@_);

    # Tolerate an object whose list was never set up
    my $recorded_paths = $self->{'ocrtmp_file_paths'} || [];

    foreach my $ocrtmp_file_path (@$recorded_paths) {
        if (-d $ocrtmp_file_path) {
            &FileUtils::removeFilesRecursive($ocrtmp_file_path);
        } elsif (-e $ocrtmp_file_path) {
            &FileUtils::removeFiles($ocrtmp_file_path);
        }
        # paths that no longer exist need no action
    }

    # Must be an array reference: assigning () to a scalar slot stores undef.
    $self->{'ocrtmp_file_paths'} = [];

    return;
}

1;
