###########################################################################
#
# TikaConverter - helper plugin that does various types of document
#                 conversion with Apache Tika
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 2010 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
package TikaConverter;

use BaseMediaConverter;

use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa

use gsprintf 'gsprintf';

# Module-wide availability flags, assigned by the BEGIN block below.
# These two variables mustn't be initialised here or they will get stuck
# at those values: an initialiser on these declarations would execute at
# run time, after BEGIN has finished, and overwrite what BEGIN worked out.
our $tika_conversion_available; 
our $no_tika_conversion_reason; 

# Compile-time set-up: establish inheritance from BaseMediaConverter and work
# out whether Tika conversion can be offered.  The outcome is left in
# $tika_conversion_available (1 or 0) and, when 0, $no_tika_conversion_reason
# names the cause:
#   gexttikanotinstalled    - the GEXT_TIKA environment variable is not set
#   gexttikajarnotinstalled - tika-app.jar is missing under GEXT_TIKA/lib/java
#   tikanotinstalled        - java could not be run from the path
BEGIN {
    @TikaConverter::ISA = ('BaseMediaConverter');

    # Check that Tika is installed and available on the path 
    $tika_conversion_available = 1;
    $no_tika_conversion_reason = "";
    
    if (!defined $ENV{'GEXT_TIKA'}) {
	$tika_conversion_available = 0;
	$no_tika_conversion_reason = "gexttikanotinstalled";
    }
    else {
	my $gextpb_home = $ENV{'GEXT_TIKA'};
	my $pbajar = &util::filename_cat($gextpb_home,"lib","java","tika-app.jar");

	if (!-e $pbajar) {
	    print STDERR "Failed to find $pbajar\n";
	    $tika_conversion_available = 0;
	    $no_tika_conversion_reason = "gexttikajarnotinstalled";
	}
	else {
	    # Test to see if java is in the path.  "java -version" is used
	    # rather than a bare "java": the bare command only prints usage
	    # and exits with a non-zero status on current JDKs, which would
	    # make this check fail even though java is installed.
	    my $is_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows/i);
	    my $null_device = $is_windows ? "nul" : "/dev/null";

	    # stdout is redirected first so that "2>&1" sends stderr (where
	    # java writes its version banner) to the null device as well.
	    my $cmd = "java -version >$null_device 2>&1";

	    my $status = system($cmd);
	    if ($status != 0) {
		print STDERR "Testing for java\n";
		print STDERR "Failed to run: $cmd\n";
		if ($status == -1) {
		    # system() could not start the command; $! says why
		    print STDERR "$!\n";
		}
		else {
		    print STDERR "Exit status: " . ($status >> 8) . "\n";
		}
		$tika_conversion_available = 0;
		$no_tika_conversion_reason = "tikanotinstalled";
	    } 
	}
    }

}

# This plugin contributes no arguments of its own.
my $arguments = [];

# Plugin description record: an abstract plugin that inherits its options.
my $options = {
    'name'     => "TikaConverter",
    'desc'     => "{TikaConverter.desc}",
    'abstract' => "yes",
    'inherits' => "yes",
    'args'     => $arguments,
};

# Constructor.
#
# Arguments:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through to the BaseMediaConverter constructor
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's arguments and options
#
# Returns the object blessed into $class.  Unless the object is "info_only",
# it carries 'tika_conversion_available' and either 'tika_launch_cmd' (the
# command prefix used to start Tika) or 'no_tika_conversion_reason'.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    my $self = BaseMediaConverter->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
	# don't worry about any options etc
	return bless $self, $class;
    }
    if ($tika_conversion_available) {
	my $gextpb_home = $ENV{'GEXT_TIKA'};
	my $pbajar = &util::filename_cat($gextpb_home,"lib","java","tika-app.jar");

	# Launch the Tika application jar with the java found on the path
	# (the same java and jar that the BEGIN block checked for).  The jar
	# path is quoted in case it contains spaces.
	my $launch_cmd = "java -jar \"$pbajar\"";
	
	$self->{'tika_launch_cmd'} = $launch_cmd;
    }
    else {       
	$self->{'no_tika_conversion_reason'} = $no_tika_conversion_reason;

	# Report why conversion is unavailable on the plugin's output handle
	my $outhandle = $self->{'outhandle'};
	&gsprintf($outhandle, "TikaConverter: {TikaConverter.noconversionavailable} ({TikaConverter.$no_tika_conversion_reason})\n");
    }  

    $self->{'tika_conversion_available'} = $tika_conversion_available;
    
    return bless $self, $class;

}

# Prepare the converter for a run by starting with an empty list of temporary
# output files.  The $verbosity, $outhandle and $failhandle arguments are
# accepted but not used here.
sub init {
    my $self = shift(@_);
    my ($verbosity, $outhandle, $failhandle) = @_;

    # An array reference is needed here: assigning the empty list "()" to a
    # hash element stores undef rather than an empty list.
    $self->{'ttmp_file_paths'} = [];
}

# Tear-down hook: removes whatever temporary files this converter recorded.
sub deinit {
    my ($self) = @_;

    return $self->clean_up_temporary_files();
}


# Convert a source document into the requested target type using Tika.
#
# Arguments:
#   $source_file_full_path - full path of the document to convert
#   $target_file_type      - type (file extension) of the output, e.g. "html"
# Any further arguments (convert_without_result passes some) are ignored.
#
# Returns 0 when Tika conversion is unavailable or the source is not a plain
# file.  Otherwise returns the list (status, result, target_file_path), where
# status is 0 if autorun_general_cmd reported an error and 1 if it did not.
sub convert {
    my $self = shift(@_);
    my ($source_file_full_path, $target_file_type) = @_;

    return 0 unless $tika_conversion_available;
    # check the filename
    return 0 if ( !-f $source_file_full_path);

    my $source_file_no_path = &File::Basename::basename($source_file_full_path);
    # Determine the full name and path of the output file
    my $target_file_path;
    if ($self->{'enable_cache'}) {
	$self->init_cache_for_file($source_file_full_path);
	my $cache_dir = $self->{'cached_dir'};
	my $file_root = $self->{'cached_file_root'};
	#$file_root .= "_$convert_id" if ($convert_id ne "");
	my $target_file = "$file_root.$target_file_type";
	$target_file_path = &util::filename_cat($cache_dir,$target_file);
    }
    else {
	# this is in gsdl/tmp. get a tmp filename in collection instead???
	$target_file_path = &util::get_tmp_filename($target_file_type);
	# remember it so clean_up_temporary_files can remove it later
	push(@{$self->{'ttmp_file_paths'}}, $target_file_path);
    }

    # Generate and run the convert command.  The constructor stores the
    # launch command under 'tika_launch_cmd'; 'launch_cmd' is still honoured
    # as a fallback for objects that only set that key.
    my $convert_cmd = $self->{'tika_launch_cmd'};
    $convert_cmd = $self->{'launch_cmd'} unless (defined $convert_cmd);
    $convert_cmd = "" unless (defined $convert_cmd);

    # NOTE(review): Tika's command line documents -h/--html and writes its
    # output to stdout -- confirm that "-html" followed by a target path is
    # what the configured launch command expects.
    $convert_cmd .= " -html" if ($target_file_type eq "html");
    # Append to the command built so far (a fresh variable here would drop
    # the launch command and the -html flag).
    $convert_cmd .= " \"$source_file_full_path\" \"$target_file_path\"";

    my $print_info = { 'message_prefix' => "Tika Conversion",
		       'message' => "Converting $source_file_no_path to: $target_file_type" };
    # $print_info->{'cache_mode'} = $cache_mode if ($cache_mode ne "");

    my ($regenerated,$result,$had_error) 
	= $self->autorun_general_cmd($convert_cmd,$source_file_full_path, $target_file_path,$print_info);
    if ($had_error) {
	return (0, $result,$target_file_path);
    }
    return (1, $result,$target_file_path);
}

# Variant entry point that forwards to convert().
#
# Takes the source file path, the target file type and, optionally, convert
# options and a convert id (each defaulting to "" when missing or false),
# and passes them on together with the marker "without_result".  Whatever
# convert() returns is returned unchanged.
sub convert_without_result {
    my ($self, $source_file_path, $target_file_type, @optional) = @_;

    my $convert_options = shift(@optional) || "";
    my $convert_id      = shift(@optional) || "";

    my @forwarded = ($source_file_path, $target_file_type,
		     $convert_options, $convert_id, "without_result");
    return $self->convert(@forwarded);
}

# Delete every temporary output file recorded in 'ttmp_file_paths' that still
# exists, then reset the list to empty.
sub clean_up_temporary_files {
    my $self = shift(@_);

    # Tolerate being called when no list has been set up yet
    my $ttmp_file_paths = $self->{'ttmp_file_paths'} || [];

    foreach my $ttmp_file_path (@$ttmp_file_paths) {
	if (-e $ttmp_file_path) {
	    &util::rm($ttmp_file_path);
	}
    }

    # An array reference is needed here: assigning the empty list "()" to a
    # hash element stores undef rather than an empty list.
    $self->{'ttmp_file_paths'} = [];
}



1;	
