###########################################################################
#
# ngramj.pm -- Identify the language of a piece of text
#
#
# This file is based on TextCat version 1.08 by Gertjan van Noord
# Copyright (C) 1997 Gertjan van Noord (vannoord@let.rug.nl)
# TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat 
#
# It was modified by Gordon Paynter (gwp@cs.waikato.ac.nz) and turned
# into a package for use in the Greenstone digital library system.  Most of
# the modifications consist of commenting out or deleting functionality
# I don't need.
#
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package ngramj;

use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa

# Construct an ngramj language classifier object.
#
# Arguments:
#   $verbosity - numeric verbosity level; classify_contents prints a summary
#                of the reported language scores when this is >= 2.
#                Defaults to 0 when not supplied.
#   $outhandle - filehandle the verbose summary is printed to.
#                Defaults to STDERR when not supplied.
#
# Returns a blessed hash with the keys 'verbosity', 'outhandle' and
# 'java_cmd' (the command prefix used to launch cngram.jar, which lives
# under $GSDLHOME/ext/ngramj/jars).
sub new {
    my $class = shift (@_);
    my ($verbosity, $outhandle) = @_;

    # Fall back to sane defaults so later ">= 2" tests and prints never
    # operate on undefined values.
    $verbosity = 0        unless defined $verbosity;
    $outhandle = \*STDERR unless defined $outhandle;

    my $self = { 'verbosity' => $verbosity, 'outhandle' => $outhandle };

    my $ngram_jar = &util::filename_cat($ENV{'GSDLHOME'}, "ext", "ngramj", "jars", "cngram.jar");

    # The command is later run through the shell, so quote the jar path:
    # an installation directory containing spaces would otherwise be split
    # into several arguments.
    $self->{'java_cmd'} = "java -jar \"$ngram_jar\"";

    return bless $self, $class;
}


# Identify the language(s) of a piece of text by running cngram.jar over it.
#
# The text is written (as UTF-8) to a temporary file, the Java classifier is
# run on that file through a pipe, and every output line of the form
# "speed: lang:score lang:score ... .." is parsed.
#
# Arguments:
#   $contents_ref - reference to the scalar holding the text to classify
#   $filename     - not used by this routine; kept so the signature stays
#                   compatible with existing callers
#   $opt_encoding - optional; when defined it is passed to the classifier as
#                   an extra command-line argument and appended to every
#                   returned language as "lang-encoding"
#
# Returns a reference to an array of language strings in the order the
# classifier reported them (empty if no "speed:" line was seen), or undef if
# the temporary file could not be written or the classifier could not be
# started.  When verbosity >= 2 a summary of the raw "lang:score" tokens is
# printed to the object's outhandle.
sub classify_contents {
    my ($self, $contents_ref, $filename, $opt_encoding) = @_;

    # Save the text to a temporary file for the classifier to read.
    my $tmp_txt_filename = &util::get_tmp_filename("txt");

    my $tmp_fh;
    if (!open($tmp_fh, '>', $tmp_txt_filename)) {
        print STDERR "Failed to open $tmp_txt_filename\n";
        print STDERR "$!\n";
        return undef;
    }
    binmode($tmp_fh, ":utf8");
    my $written = print {$tmp_fh} $$contents_ref;

    # Buffered write errors may only surface at close, so check both; a
    # truncated file would otherwise be classified silently.
    if (!close($tmp_fh) || !$written) {
        print STDERR "Failed to write $tmp_txt_filename\n";
        print STDERR "$!\n";
        &util::rm($tmp_txt_filename);
        return undef;
    }

    # Build the classifier command.  The temporary filename is quoted
    # because the command is run through the shell and the path may
    # contain spaces.
    my $java_cmd = $self->{'java_cmd'};
    $java_cmd .= " -lang2 \"$tmp_txt_filename\"";
    $java_cmd .= " $opt_encoding" if (defined $opt_encoding);

    my $ngram_fh;
    if (!open($ngram_fh, "$java_cmd |")) {
        print STDERR "Failed to open pipe to $java_cmd\n";
        print STDERR "$!\n";
        # Do not leave the temporary file behind on failure.
        &util::rm($tmp_txt_filename);
        return undef;
    }

    # Parse what comes back and turn it into an array of 'lang-encode' form.
    my $lang_encode_pairs = [];
    while (defined (my $line = <$ngram_fh>)) {
        next unless ($line =~ m/^\s*speed:\s*(.*?)\s*\.\./);
        my $lang_group = $1;

        my @lang_summary = ("++ Ngram language probabilities:\n++  ");

        foreach my $lang_token (split(/\s+/, $lang_group)) {
            # Every token goes into the verbose summary, even malformed
            # ones, so unexpected classifier output stays visible.
            push(@lang_summary, $lang_token);

            # Only well-formed "lang:score" tokens contribute a result;
            # anything else would put an undefined language in the list.
            my ($lang) = ($lang_token =~ m/^(.+):.+$/);
            next unless defined $lang;

            my $lang_pair = $lang;
            $lang_pair .= "-$opt_encoding" if (defined $opt_encoding);

            push(@$lang_encode_pairs, $lang_pair);
        }
        push(@lang_summary, "\n");

        if (($self->{'verbosity'} || 0) >= 2) {
            my $outhandle = $self->{'outhandle'};
            my $lang_summary_str = join(" ", @lang_summary);
            print $outhandle $lang_summary_str;
        }
    }

    # Closing the pipe waits for the child process, so it is reaped and no
    # longer holds the temporary file when that file is removed below.  The
    # exit status is deliberately not treated as fatal.
    close($ngram_fh);

    &util::rm($tmp_txt_filename);

    # Return the languages (with optional "-encoding" suffix) found for
    # this text.
    return $lang_encode_pairs;
}


# Classify text for one particular encoding.
#
# Delegates straight to classify_contents, passing $filter_by_encoding as
# that routine's optional encoding argument, and returns whatever
# classify_contents returns.
sub classify_contents_for_encoding {
    my $self = shift(@_);
    my ($contents_ref, $filename, $filter_by_encoding) = @_;

    my @forwarded_args = ($contents_ref, $filename, $filter_by_encoding);
    return $self->classify_contents(@forwarded_args);
}
   


1;
