#####################################################################
#
# TesseractTextExtractor.pm -- helper plugin that allows other plugins
# (such as ImagePlugin and PagedImagePlugin) to extend their
# processing capability through sub-classing inheritance (such as
# TesseractImagePlugin and TesseractPagedImagePlugin) to
# expand the image processing capabilities at ingest time to
# include Tesseract allowing for:  OCR text recognition.
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 2024 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package TesseractTextExtractor;

use BaseMediaConverter;

use strict;
no  strict 'refs'; # allow filehandles to be variables and viceversa
no  strict 'subs';

use gsprintf 'gsprintf';
use FileUtils;


# Set up the inheritance chain at compile time: this package derives
# from BaseMediaConverter.
BEGIN {
    @TesseractTextExtractor::ISA = qw(BaseMediaConverter);
}


# Argument descriptors contributed by this class. new() appends them to
# the caller-supplied "ArgList".
my $arguments = [
    {
        'name' => 'ocr_language',
        'desc' => '{TesseractTextExtractor.ocr_language}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => 'eng',
    },
];

# Option record describing this class. new() appends it to the
# caller-supplied "OptList".
my $options = {
    'name'     => 'TesseractTextExtractor',
    'desc'     => '{TesseractTextExtractor.desc}',
    'abstract' => 'yes',
    'inherits' => 'yes',
    'args'     => $arguments,
};

# Constructor.
#
# Parameters (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the parent constructor
#   $hashArgOptLists - hash ref; this class's argument descriptors are
#                      appended to its "ArgList" entry and its option
#                      record to its "OptList" entry
#
# Returns the object built by BaseMediaConverter, re-blessed into $class.
sub new {
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;

    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    # The trailing 1 asks the parent to set up the arg structures only;
    # the args themselves are not parsed here.
    my $self = BaseMediaConverter->new($pluginlist, $inputargs, $hashArgOptLists, 1);

    bless $self, $class;
    return $self;
}
     
# Probe for a runnable 'tesseract' executable and record the outcome.
#
# Runs "tesseract -v" through the shell with all output discarded and
# inspects the status returned by system().
#
# Fields set on $self:
#   ocrimage_conversion_available - 1 if the probe returned status 0, else 0
#   no_ocrimage_cinversion_reason - (sic) suffix of the resource key that
#                                   explains the failure, "" on success;
#                                   the misspelt key is kept for backward
#                                   compatibility
#   no_ocrimage_conversion_reason - correctly spelt alias of the above
#   ocrtmp_file_paths             - reset to an empty array ref
sub init {
    my $self = shift(@_);

    my $ocrimage_conversion_available = 1;
    my $no_ocrimage_conversion_reason = "";

    # $ENV{'GEXT_TESSERACT'} is deliberately not consulted: tesseract may
    # be available some other way than through the GS extension, so the
    # only test is whether the command can actually be run.

    # Discard both stdout and stderr of the probe. The stdout redirect
    # must come before "2>&1" so that stderr follows it to the null device.
    my $is_windows  = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows/i) ? 1 : 0;
    my $null_device = $is_windows ? "nul" : "/dev/null";
    my $cmd         = "tesseract -v >$null_device 2>&1";

    my $status   = system($cmd);
    my $os_error = "$!"; # capture straight away, before anything else can overwrite it

    if ($status != 0) {
        my $error_message =  "**** Testing for Tesseract\n";
        $error_message .= "Failed to run: $cmd\n";
        $error_message .= "Error variable: |$os_error| and status: $status\n";

        &gsprintf(STDERR, "TesseractTextExtractor: $error_message");

        $ocrimage_conversion_available = 0;
        $no_ocrimage_conversion_reason = "couldnotruntesseract";
    } else {
        &gsprintf(STDERR,  "**** Testing for Tesseract: status was 0 - ok\n");
    }

    $self->{'ocrimage_conversion_available'} = $ocrimage_conversion_available;
    $self->{'no_ocrimage_cinversion_reason'} = $no_ocrimage_conversion_reason;
    $self->{'no_ocrimage_conversion_reason'} = $no_ocrimage_conversion_reason;

    if ($self->{'ocrimage_conversion_available'} == 0) {
        my $outhandle = $self->{'outhandle'};
        &gsprintf($outhandle, "TesseractTextExtractor: {TesseractTextExtractor.noconversionavailable} ({TesseractTextExtractor.". $self->{'no_ocrimage_cinversion_reason'} ."})\n");
    }

    # An explicit empty array ref: assigning () to a hash element would
    # store undef rather than an empty list.
    $self->{'ocrtmp_file_paths'} = [];
}


# Run Tesseract OCR over one image file and produce a plain-text file.
#
# Parameters:
#   $source_file_full_path - path of the image to process
#
# Returns a list:
#   (0, $message)                on failure: conversion unavailable, source
#                                file missing, or the command reported an error
#   (1, $result, $txt_file_path) on success, where $txt_file_path is the
#                                generated ".txt" file
#
# With $self->{'enable_cache'} set, the output base name is built from the
# cache directory and file root prepared by init_cache_for_file(). Otherwise
# a temporary ".txt" file name is used, and it is remembered in
# $self->{'ocrtmp_file_paths'} so clean_up_temporary_files() can remove it.
#
# If $self->{'ocr_language'} is set, it is passed to Tesseract with "-l".
sub run_tesseract {
    my $self = shift(@_);
    my ($source_file_full_path) = @_;

    return (0, "ocrconversion not available") unless $self->{"ocrimage_conversion_available"};
    return (0, "file doesn't exist") if (!defined $source_file_full_path || !-f $source_file_full_path);

    my $target_file_path;

    if ($self->{'enable_cache'}) {
        $self->init_cache_for_file($source_file_full_path);
        my $cache_dir = $self->{'cached_dir'};
        my $file_root = $self->{'cached_file_root'};

        $target_file_path = &FileUtils::filenameConcatenate($cache_dir, $file_root);

    } else {
        $target_file_path = &util::get_tmp_filename("txt");
        push(@{$self->{'ocrtmp_file_paths'}}, $target_file_path);
        $target_file_path =~ s/\.[^\.]+$//; # remove file extension as Tesseract will add it
    }

    # Honour the ocr_language option. Only values made of characters that
    # occur in Tesseract language specs (e.g. "eng", "eng+fra", "chi_sim")
    # are accepted, because the value is interpolated into a shell command.
    my $language_option = "";
    my $ocr_language = $self->{'ocr_language'};
    if (defined $ocr_language && $ocr_language ne "") {
        if ($ocr_language =~ m/^[A-Za-z0-9_+\/.\-]+$/) {
            $language_option = " -l \"$ocr_language\"";
        } else {
            print STDERR "TesseractTextExtractor: ignoring unexpected ocr_language value '$ocr_language'\n";
        }
    }

    # Options follow the output base; the trailing "txt" is kept as the
    # last argument, as in the original command.
    my $convert_cmd = "tesseract ";
    $convert_cmd .= " \"$source_file_full_path\" \"$target_file_path\"$language_option txt";

    &gsprintf (STDERR, "TesseractTextExtractor convert command: $convert_cmd\n");

    my $print_info = {
        'message_prefix' => "Tesseract OCR Text Extraction",
        'message' => "Extracting text from  $source_file_full_path"
    };

    my $generated_txt_path = $target_file_path.".txt";
    my ($regenerated, $result, $had_error) = $self->autorun_general_cmd($convert_cmd, $source_file_full_path, $generated_txt_path, $print_info);

    if ($had_error) {
        return (0, $result);
    }

    return (1, $result, $generated_txt_path);
}

# Remove every temporary OCR output file recorded in
# $self->{'ocrtmp_file_paths'} that still exists, then reset the list to
# an empty array ref. Paths that no longer exist are skipped.
sub clean_up_temporary_files {
    my $self = shift(@_);

    # Fall back to an empty list if the slot was never initialised, so the
    # loop does not have to autovivify it.
    my $ocrtmp_file_paths = $self->{'ocrtmp_file_paths'} || [];

    foreach my $ocrtmp_file_path (@$ocrtmp_file_paths) {
        if (-e $ocrtmp_file_path) {
            print STDERR "removing OCR tmp file $ocrtmp_file_path\n" if $self->{'verbosity'} >2;
            &FileUtils::removeFiles($ocrtmp_file_path);
        }
    }

    # An explicit empty array ref: assigning () to a hash element would
    # store undef rather than an empty list.
    $self->{'ocrtmp_file_paths'} = [];
}

1;

