######################################################################
#
# TesseractImagePlugin.pm -- plugin that extends the capability of 
# ImagePlugin to use Tesseract for OCR text recognition.
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 2024 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package TesseractImagePlugin;

use strict;
no  strict 'refs'; # allow filehandles to be variables and viceversa
no  strict 'subs';

use utf8;

use gsprintf;
use FileUtils;
use util;

use ImagePlugin;
use TesseractTextExtractor;
use ReadTextFile;

# Set up the inheritance list at compile time.  ImagePlugin is listed
# first, followed by the ReadTextFile and TesseractTextExtractor classes
# whose methods this plugin also calls.
BEGIN {
    @TesseractImagePlugin::ISA = qw(ImagePlugin ReadTextFile TesseractTextExtractor);
}

# This plugin declares no arguments of its own; the list is still
# registered by new() alongside those of the parent classes.
my $arguments = [];

# Descriptor for this plugin, registered by new() in the shared
# option list.  'args' points at the (empty) argument list above.
my $options = {
    'name'     => 'TesseractImagePlugin',
    'desc'     => '{TesseractImagePlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};

# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the parent constructors
#   $hashArgOptLists - hash ref; this plugin's argument list is appended to
#                      its "ArgList" entry and its option descriptor to its
#                      "OptList" entry
#
# Returns the object built by ImagePlugin's constructor, re-blessed into
# $class.
sub new {
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;

    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    # The trailing 1 (auxiliary) means "don't parse the args during new":
    # these calls only set up the arg structures, and parsing happens once,
    # on the last plugin constructed (ImagePlugin below).
    #
    # Explicit Class->new(...) calls are used rather than the indirect-object
    # form "new Class(...)", which Perl has to parse by heuristic.
    TesseractTextExtractor->new($pluginlist, $inputargs, $hashArgOptLists, 1);
    ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);
    my $self = ImagePlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}

# Initialise the plugin.
#
# The caller's arguments (verbosity, outhandle, failhandle) are forwarded
# untouched to the inherited init; the Tesseract extractor's init is then
# invoked with no arguments, and its result is what this method returns.
sub init {
    my $self = shift;

    $self->SUPER::init(@_);

    return $self->TesseractTextExtractor::init();
}


# Delegate explicitly to ReadTextFile's implementation, forwarding all
# arguments and returning whatever it returns.
sub auto_extract_metadata {
    my $self = shift;

    return $self->ReadTextFile::auto_extract_metadata(@_);
}

# Plugin-specific processing of doc_obj: run Tesseract over the source
# file and, if that succeeds, attach the recognised text to the top
# section of the document.  The inherited process() is always invoked
# afterwards with the original arguments, and its result is returned.
#
# A failed OCR run is reported on STDERR but does not stop processing.
sub process {
    my $self = shift;
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    # Only the full path (first returned value) is needed here.
    my ($filename_full_path) = util::get_full_filenames($base_dir, $file);

    my ($success, $result, $output_file) = $self->run_tesseract($filename_full_path);

    if (!$success) {
        print STDERR "conversion didn't work, result: $result\n";
    }
    else {
        $self->process_text($output_file, $doc_obj, $doc_obj->get_top_section());
    }

    return $self->SUPER::process(@_);
}

# Read a text file and add its contents to a section of the document.
# (Logic currently copied from the paged image plugin.)
#
# Arguments:
#   $filename_full_path - path of the text file to read
#   $doc_obj            - document object that receives the text
#   $cursection         - section of $doc_obj to add the text to
#
# Returns 0 if the file does not exist, 1 otherwise (including when the
# file turned out to be empty, which only produces a warning).
sub process_text {
    my ($self, $filename_full_path, $doc_obj, $cursection) = @_;

    # Nothing to do if the text file is missing.
    unless (-f $filename_full_path) {
        print "TesseractImagePlugin: ERROR: File $filename_full_path does not exist, skipping\n";
        return 0;
    }

    # Work out language and encoding, then read the file
    # (read_file hands the text back already decoded as utf8).
    my ($language, $encoding) = $self->textcat_get_language_encoding($filename_full_path);

    my $text = "";
    ReadTextFile::read_file($self, $filename_full_path, $encoding, $language, \$text);

    # An empty file is unusual but not impossible, so only warn about it.
    print "TesseractImagePlugin: WARNING: $filename_full_path contains no text\n"
        unless length($text);

    # Macro language escaping.  The escape character itself must be escaped,
    # or else mg will convert e.g. '\n' into a literal newline instead of
    # leaving the text as it is; underscores are escaped as well.
    for ($text) {
        s/\\/\\\\/g;
        s/_/\\_/g;
    }

    $text = util::trim($text);

    my $section_text;
    if ($text =~ m/<html.*?>\s*<head.*?>.*<\/head>\s*<body.*?>(.*)<\/body>\s*<\/html>\s*$/is) {
        # Looks like HTML input: keep just the body content, with no
        # escaping of < and > and no <pre> wrapper.
        $section_text = $1;
    }
    else {
        # Plain text: escape angle brackets and wrap in preformat tags.
        for ($text) {
            s/</&lt;/g;
            s/>/&gt;/g;
        }
        $section_text = "<pre>\n" . $text . "\n</pre>";
    }

    # Add the prepared text to the document object.
    $doc_obj->add_utf8_text($cursection, $section_text);

    return 1;
}

# Post-processing hook: run the inherited clean-up first, then ask the
# Tesseract extractor to clean up its temporary files.  The result of
# that second call is returned.
sub clean_up_after_doc_obj_processing {
    my $self = shift;

    $self->SUPER::clean_up_after_doc_obj_processing();

    return $self->TesseractTextExtractor::clean_up_temporary_files();
}

1;





