###########################################################################
#
# MalwareCheckerConverter - allows files to be scanned for malware through VirusTotal API
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 2010 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
package MalwareCheckerConverter;

use BaseMediaConverter;

use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa
no strict 'subs'; # allow barewords (eg STDERR) as function arguments

#use HTML::Entities; # for encoding characters into their HTML entities when PDFBox converts to text

use gsprintf 'gsprintf';
use FileUtils;

# Availability flag and failure reason for the malware checker.  They are
# deliberately declared without initialisers: the BEGIN block below assigns
# them at compile time, and an initialiser on these lines would run afterwards
# (at run time) and overwrite those values.
our $malwarechecker_conversion_available;
our $no_malwarechecker_conversion_reason;

# Compile-time setup: establishes inheritance from BaseMediaConverter and
# works out whether the malware checker can be used at all.  Three things are
# checked in order: the GEXT_MALWARECHECKER environment variable, the presence
# of the checker jar beneath it, and whether the java command can be run.
# The outcome is left in the two package variables declared above.
BEGIN {
    @MalwareCheckerConverter::ISA = ('BaseMediaConverter');

    # Start optimistic; a failed prerequisite clears the flag and records a
    # short reason code.
    $malwarechecker_conversion_available = 1;
    $no_malwarechecker_conversion_reason = "";

    my $extension_home = $ENV{'GEXT_MALWARECHECKER'};

    # The jar location can only be worked out once the extension home is known.
    my $checker_jar;
    if (defined $extension_home) {
        $checker_jar = &FileUtils::filenameConcatenate($extension_home, "lib", "java", "malware-checker-1.0-SNAPSHOT-jar-with-dependencies.jar");
    }

    if (!defined $extension_home) {
        # NOTE(review): this reason code mentions pdfbox; it looks carried over
        # from a PDFBox converter -- confirm against the resource bundle
        # before renaming.
        $malwarechecker_conversion_available = 0;
        $no_malwarechecker_conversion_reason = "gextpdfboxnotinstalled";
    }
    elsif (!-e $checker_jar) {
        &gsprintf(STDERR, "**** Failed to find $checker_jar\n");
        $malwarechecker_conversion_available = 0;
        $no_malwarechecker_conversion_reason = "gextmalwherecheckerjarnotinstalled";
    }
    else {
        # Probe for a working java.  "java -version" is used rather than a
        # bare "java" because the exit status of bare "java" differs between
        # JDK releases (0 for 1.6*, 1 for 1.7*), whereas "-version" yields 0
        # whenever java is installed.
        my $java = &util::get_java_command();

        # Discard the probe's output.  The stdout redirection comes first:
        # on Ubuntu "2>&1 >/dev/null" still lets output reach the screen.
        my $discard_output = ($ENV{'GSDLOS'} =~ /^windows/i)
            ? " >nul 2>&1"
            : " >/dev/null 2>&1";
        my $cmd = "$java -version" . $discard_output;

        my $status = system($cmd);

        unless ($status == 0) {
            my $error_message = join("",
                "**** Testing for java\n",
                "Failed to run: $cmd\n",
                "Error variable: |$!| and status: $status\n");

            &gsprintf(STDERR, "MalwareCheckerConverter: $error_message");

            $malwarechecker_conversion_available = 0;
            $no_malwarechecker_conversion_reason = "couldnotrunjava";
        }
    }

}

# This converter declares no arguments of its own.
my $arguments = [];

# Option descriptor for this class.  new() appends it to the "OptList" array
# of the hash it is given, and appends $arguments to the "ArgList" array.
my $options = {
    'name'     => "MalwareCheckerConverter",
    'desc'     => "{MalwareCheckerConverter.desc}",
    'abstract' => "yes",
    'inherits' => "yes",
    'args'     => $arguments,
};

# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - handed on unchanged to BaseMediaConverter->new
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays are
#                      extended with this class's $arguments and $options
#   $auxilary        - handed on unchanged to BaseMediaConverter->new
#
# Returns the object built by BaseMediaConverter->new, blessed into $class.
# When the object is flagged 'info_only' it is returned straight away without
# any of the availability set-up below.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists,$auxilary) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Direct method call rather than the indirect-object form
    # "new BaseMediaConverter(...)", whose parsing depends on what else is in
    # scope (see perlobj).
    my $self = BaseMediaConverter->new($pluginlist, $inputargs,
                                       $hashArgOptLists, $auxilary);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    if ($malwarechecker_conversion_available) {
        my $gextmw_home = $ENV{'GEXT_MALWARECHECKER'};

        # NOTE(review): the launch commands assembled below name PDFBox jars
        # and classes and look carried over from a PDFBox converter.
        # checker() in this file does not read them -- confirm whether they
        # are still needed.
        my $pbajar = &FileUtils::filenameConcatenate($gextmw_home,"lib","java","pdfbox-app.jar");
        my $pbjbigjar = &FileUtils::filenameConcatenate($gextmw_home,"lib","java","jbig2-imageio-3.0.1.jar");
        # Not including the JPEG2000 jar (jai-imageio-jpeg2000), as it is
        # under a commercial license:
        # https://github.com/jai-imageio/jai-imageio-jpeg2000

        my $java = &util::get_java_command();
        $self->{'malwarechecker_txt_launch_cmd'} = "$java -cp \"$pbajar\" org.apache.pdfbox.tools.ExtractText";
        $self->{'malwarechecker_html_launch_cmd'} = "$java -cp \"$pbajar\" -Dline.separator=\"<br />\" org.apache.pdfbox.tools.ExtractText";

        # Classpath for the custom PDFBoxToImagesAndText class: the pdfbox
        # jar, the jbig2-imageio library and the extension's build folder.
        my $malwarechecker_build = &FileUtils::filenameConcatenate($gextmw_home,"build");
        my $classpath = &util::pathname_cat($pbajar, $pbjbigjar, $malwarechecker_build);

        # Use the same java command as the other launch commands (and as the
        # one probed in BEGIN) instead of a hard-coded "java".
        $self->{'malwarechecker_imgtxt_launch_cmd'} = "$java -cp \"$classpath\" org.greenstone.pdfbox.PDFBoxToImagesAndText";
    }
    else {
        $self->{'no_malwarechecker_conversion_reason'} = $no_malwarechecker_conversion_reason;

        my $outhandle = $self->{'outhandle'};
        # The prefix names this module.  The {PDFBoxConverter.*} resource keys
        # are left as they were.
        # NOTE(review): confirm whether MalwareCheckerConverter-specific keys
        # exist in the resource bundle before switching to them.
        &gsprintf($outhandle, "MalwareCheckerConverter: {PDFBoxConverter.noconversionavailable} ({PDFBoxConverter.$no_malwarechecker_conversion_reason})\n");
    }

    print STDERR "**** malware check avail = $malwarechecker_conversion_available\n";

    $self->{'malwarechecker_conversion_available'} = $malwarechecker_conversion_available;

    return bless $self, $class;

}

# Per-run initialisation.
#
# Arguments (after $self) are forwarded untouched to CommonUtil::init; callers
# pass verbosity, outhandle and failhandle in that order.
#
# Resets the list of temporary paths that clean_up_temporary_files() walks,
# then returns whatever CommonUtil::init returns.
sub init {
    print STDERR "******* MalwareCheckerConverer init() !!!!!!\n";
    my $self = shift(@_);

    # This has to be an array *reference*: assigning the empty list () to a
    # hash slot stores undef rather than an empty array.
    $self->{'pbtmp_file_paths'} = [];

    # *****
    # Called for its effect on $self -- presumably it records the verbosity
    # there (confirm against CommonUtil).
    $self->CommonUtil::init(@_);
}

# Tear-down hook: hands straight over to clean_up_temporary_files() and
# passes back whatever that call yields.
sub deinit {
    my ($self) = @_;

    return $self->clean_up_temporary_files();
}


# Submit one file to the VirusTotal scanner and record the outcome.
#
# Arguments (after $self):
#   $doc_obj               - document object that receives the metadata
#   $source_file_full_path - full path of the file to be scanned
#
# Runs org.greenstone.virustotal.ScanFile from the malware-checker jar via
# autorun_general_cmd().  On success the (chomped) command result is stored
# on the document's top section as "VirusTotalResourceID"; on failure an
# error is printed to STDERR.  When $self->{'virustotal_keytype'} is "public"
# the call then sleeps for 15 seconds.
#
# Reads $self->{'virustotal_apikey'}, $self->{'virustotal_keytype'},
# $self->{'verbosity'} and $self->{'outhandle'}.
sub checker {
    my $self = shift(@_);
    my ($doc_obj, $source_file_full_path) = @_;

    print STDERR "***** @@@@@@ !!!!!!! MalwareCheckerConvert::convert called!\n";


    print STDERR "**** want to check: $source_file_full_path\n";

    my $virustotal_apikey = $self->{'virustotal_apikey'};

    # Build up a command along the lines of:
    #   java -cp "<GEXT_MALWARECHECKER>/lib/java/malware-checker-1.0-SNAPSHOT-jar-with-dependencies.jar" \
    #        org.greenstone.virustotal.ScanFile <apikey> "<file>"

    my $jar_file = &FileUtils::filenameConcatenate($ENV{'GEXT_MALWARECHECKER'},"lib","java","malware-checker-1.0-SNAPSHOT-jar-with-dependencies.jar");

    # Use the java command that BEGIN probed, not a bare "java" that may not
    # be on the PATH.
    my $java = &util::get_java_command();

    # Both paths are double-quoted (as new() does for its jar paths) so that
    # directories or file names containing spaces survive the command line.
    # NOTE(review): a path containing a double quote or shell metacharacters
    # such as $ or ` would still need escaping.
    my $checker_cmd = "$java -cp \"$jar_file\" org.greenstone.virustotal.ScanFile $virustotal_apikey \"$source_file_full_path\"";

    my $print_info = { 'message_prefix' => "MalwareChecker Conversion",
                       'message' => "Checking $source_file_full_path" };

    # Target file recorded alongside the source.
    my $target_file_path = $source_file_full_path . "-VIRUSTOTAL-RESOURCEID";

    my ($regenerated,$result,$had_error)
        = $self->autorun_general_cmd($checker_cmd,$source_file_full_path,$target_file_path,$print_info);

    if ($had_error) {
        print STDERR "Error: Failed to run cmd: $checker_cmd\n";
    }
    else {
        chomp($result);
        my $top_section = $doc_obj->get_top_section();
        $doc_obj->add_utf8_metadata($top_section, "VirusTotalResourceID", $result);
    }

    # Need to regulate how often we make calls to the VirusTotal API.
    # With a public key the limit is 4 per minute, hence the 15 second pause.
    # The pause currently happens on every call; restricting it to calls
    # where $regenerated is true was tried and is disabled.

    if ($self->{'virustotal_keytype'} eq "public") {
        my $verbosity = $self->{'verbosity'};
        if ($verbosity >= 1) {
            my $outhandle = $self->{'outhandle'};
            print $outhandle "Sleeping for 15 secs, in accordance with public key VirusTotal API Terms and Conditions\n";
        }
        sleep(15);
    }
}

# Remove every temporary file or directory recorded in
# $self->{'pbtmp_file_paths'}, then reset that list to empty.
#
# Directories are removed with FileUtils::removeFilesRecursive, other
# existing paths with FileUtils::removeFiles; recorded paths that no longer
# exist are skipped.  Returns nothing useful.
sub clean_up_temporary_files {
    my $self = shift(@_);

    # Tolerate the list never having been set up (for instance when init()
    # has not run yet).
    my $tmp_paths = $self->{'pbtmp_file_paths'} || [];

    foreach my $pbtmp_file_path (@$tmp_paths) {
        if (-d $pbtmp_file_path) {
            &FileUtils::removeFilesRecursive($pbtmp_file_path);
        }
        elsif (-e $pbtmp_file_path) {
            &FileUtils::removeFiles($pbtmp_file_path);
        }
    }

    # Reset to an empty array reference; assigning the empty list () here
    # would store undef instead.
    $self->{'pbtmp_file_paths'} = [];

    return;
}


1;
