#!/usr/bin/perl -w

#    Kea
#    Version 1.1.4

#    Kea -- Automatic Keyphrase Extraction
#    Copyright 1998-1999 by Gordon Paynter and Eibe Frank
#    Contact gwp@cs.waikato.ac.nz or eibe@cs.waikato.ac.nz
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

# Version history
#
# 1.0   Witten et.al.
# 1.0.1 Bug: stopword file loaded as model file
# 1.0.2 Java paths explicit for nikau; JIT compiler
# 1.0.3 Include tf.idf in output if -t is set
# 1.0.4 Allow optional keyphrase frequency file
# 1.0.5 Use $perl_command and $java_command for system-independence
# 1.0.6 -C argument selects model, stopword file, df file 
# 1.0.7 Changes to Kea.pl.
#       This is Phillip's version
# 1.0.8 Accepts .htm as well as .html
# 1.0.9 Accepts .text as well as .txt
# 1.1   First Distribution.  GPL added.  Documentation added.
# 1.1.1 -E argument sets output extension (default is .kea)
# 1.1.2 Documented java variables
#       Maximum phrase length can be set at command-line.
#       Note: default=3; NOT the length for the model. Sorry.
# 1.1.3 Moved Lynx command into separate script that checks
#       for circumstances that are likely to crash it.
# 1.1.4 Updated documentation and added a few extra files.

print STDERR "\nKea (version 1.1.4): automatic keyphrase extraction\n";

# Every path Kea uses is built from the Greenstone home directory, so stop
# here with a clear message if GSDLHOME is missing instead of going on to
# build paths that start at "/".
$gsdlhome = $ENV{'GSDLHOME'};
die "Kea: the GSDLHOME environment variable is not set\n"
    if (!defined($gsdlhome) || $gsdlhome eq "");

# Compile the iterated Lovins stemmer into the Kea directory.  gcc is run
# without a shell (list-form system) so that a GSDLHOME containing spaces
# or shell metacharacters is passed through intact.  A failed build is
# reported but is not fatal.
{
    my $kea_home    = "$gsdlhome/perllib/Kea-1.1.4";
    my $stemmer_src = "$kea_home/Iterated-Lovins-stemmer";
    my @compile = ("gcc", "-o", "$kea_home/stemmer",
                   "$stemmer_src/stem-Lovins-iterated.c",
                   "$stemmer_src/stem.c");
    if (system(@compile) != 0) {
        warn "Kea: could not compile the stemmer (gcc status $?)\n";
    }
}


# Command used whenever Kea launches one of its helper perl scripts
# in a shell.
$perl_command = "perl -w";

# Java takes a little more setting up.
# Point this variable at your java home directory.
#$java_home = "/usr/local/jdk";
$java_home = "/usr/local/share/java";

# CLASSPATH handed to java on the command line so that jaws.jar is
# found; you shouldn't need to change it.
$java_classpath = ".:$gsdlhome/perllib/Kea-1.1.4/jaws.jar:$gsdlhome/perllib/Kea-1.1.4:$java_home/lib/classes.zip";

# Name of your java just-in-time compiler (the original author used TYA).
# Leave this as an empty string if you don't have a JIT compiler.
#$java_JIT_compiler = "tya";
$java_JIT_compiler = "";

# Extra arguments for java, e.g. to give it lots of memory.
# Set this to an empty string if you don't want any.
$java_extra_args = "-ss100000000 -oss100000000 -mx200000000";

# Assemble the actual java command from the settings above; the optional
# pieces are appended only when they are non-empty.
$java_command = "$java_home/bin/java -classpath \"$java_classpath\"";
if ($java_JIT_compiler) {
    $java_command .= " -Djava.compiler=$java_JIT_compiler";
}
if ($java_extra_args) {
    $java_command .= " $java_extra_args";
}


# Parse command line options.
# getopts.pl is a Perl 4 library that is no longer shipped with perl (it
# was removed from the core in 5.16), so requiring it aborts the script on
# current installations.  Getopt::Std is the standard replacement: given
# the same option string it sets the same $opt_X package variables and
# removes the parsed switches from @ARGV.
use Getopt::Std;
getopts("dtN:E:C:F:K:L:M:S:");

# What files shall we use?  With no input file left on the command line
# there is nothing to do, so print the usage summary and stop.
if (!$ARGV[0]) { 
    die "Usage: Kea [options] text-or-html-or-cstr-files
Options:
 -d           Debug mode
 -t           Ouput TF.IDF
 -N n         Output n keyphrases
 -L n         Maximum phrases length is n (default = 3)
 -E <suffix>  Output extension is <suffix>
 -C <corpus>  Use model/df/kf/stopwords based on <corpus>  
 -F <document-frequency file>
 -K <keyphrase-frequency file>
 -M <Naive-Bayes model file>
 -S <stopword file>
See README for more detail.
";
}


# -N Number of phrases to extract.  When the option is absent the "-N"
# argument passed on to KEP stays empty; otherwise the value must be a
# positive whole number.
if (!$opt_N) {
    print STDERR "Number of phrases dictated by model (default)\n";
    $number_of_phrases = "";
} elsif (($opt_N =~ /^\d+$/) && ($opt_N > 0)) {
    $number_of_phrases = "-N $opt_N";
    print STDERR "Number of phrases to extract: $opt_N\n";
} else {
    die "Kea cannot understand -N argument (must be a number): $opt_N\n";
}

# -L Maximum phrase length.  Zero means "not given", in which case no -L
# argument is added to the k4.pl command later on.
$maximum_phrase_length = 0;
if ($opt_L) {
    unless (($opt_L =~ /^\d+$/) && ($opt_L > 0)) {
        die "Kea cannot understand -L argument (must be a number): $opt_L\n";
    }
    $maximum_phrase_length = "$opt_L";
    print STDERR "Maximum phrase length: $opt_L\n";
}

# -E Output extension (default "kea"); a leading dot in the argument
# is dropped.
$extension = "kea";
if ($opt_E) {
    ($extension = $opt_E) =~ s/^\.//g;
    print STDERR "Using output extension: $extension\n";
}

# -t Should we output tfidf?  (This option is used by Kniles.)
# $opt_t is deliberately named twice -- presumably so that "perl -w" does
# not report the variable as used only once.
$output_tfidf = ($opt_t && $opt_t) ? 1 : 0;
if ($output_tfidf) {
    print STDERR "Do print tf.idf\n";
}


# -C Corpus file stem: "aliweb" unless -C names another corpus.
if ($opt_C) {
    print STDERR "Corpus: $opt_C (setting default model/stopwords/df)\n";
    $default_stem = "$gsdlhome/perllib/Kea-1.1.4/$opt_C";
} else {
    $default_stem = "$gsdlhome/perllib/Kea-1.1.4/aliweb";
}

# Defaults derived from the stem.  The keyphrase-frequency file is
# optional, so it is only picked up when it actually exists.
$frequency_file = "$default_stem.df";
$stopword_file = "$default_stem.stopwords";
$model_file = "$default_stem.model";
$keyword_frequency_file = (-e "$default_stem.kf") ? "$default_stem.kf" : "";


# -F Document Frequency file (named relative to the Kea directory)
if ($opt_F) {
    $frequency_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_F";
}
print STDERR "Document frequency file: $frequency_file\n";
unless (-e $frequency_file) {
    die "Document frequency file does not exist!\n";
}

# -M Model file (named relative to the Kea directory)
if ($opt_M) {
    $model_file =  "$gsdlhome/perllib/Kea-1.1.4/$opt_M";
}
print STDERR "Model file: $model_file\n";
unless (-e $model_file) {
    die "Model file does not exist!\n";
}

# -S Stopword file (named relative to the Kea directory)
if ($opt_S) {
    $stopword_file =  "$gsdlhome/perllib/Kea-1.1.4/$opt_S";
}
print STDERR "Stopword file: $stopword_file\n";
unless (-e $stopword_file) {
    die "Stopword file does not exist!\n";
}

# -K Keyword frequency file (named relative to the Kea directory).
# Running without one is allowed; naming one that is missing is an error.
if ($opt_K) {
    $keyword_frequency_file =  "$gsdlhome/perllib/Kea-1.1.4/$opt_K";
}
if (!$keyword_frequency_file) {
    print STDERR "No keyword frequency file (default)\n";
} else {
    print STDERR "Keyword frequency file: $keyword_frequency_file\n";
    unless (-e $keyword_frequency_file) {
        die "Keyword frequency file does not exist!\n";
    }
}

# Count the number of files
$number_of_files = scalar(@ARGV);
print STDERR "Number of files: $number_of_files\n\n";


# Set up working files.  All of them live under $gsdlhome/tmp and are
# named after this process id.

$stem = "$gsdlhome/tmp/kea.$$";

$data = "$stem.data";
$arff = "$stem.arff";
$out = "$stem.out";
$err = "$stem.err";

# Create the working directory with perl's own mkdir instead of shelling
# out, so that the path is not re-parsed by a shell and a failure is
# noticed here rather than when the later copies go wrong.  A directory
# that already exists is reused, as it was before.  The explicit chmod
# keeps the 777 mode that "mkdir -m 777" gave: the built-in mkdir applies
# the umask to the mode it is handed.
unless (-d $data) {
    mkdir($data, 0777)
        or die "Kea cannot create working directory $data: $!\n";
    chmod(0777, $data);
}


# Process each input file into a working file.
#
# Each input is copied to "$data/<index>.<type>" and, unless it already is
# a clause file, converted to text and then to "$data/<index>.clauses".
# %original_filename maps the working name (without extension) back to the
# name given on the command line; it is read again when the results are
# written out.
use File::Copy;

print STDERR "Preparing input files in: $data\n";

for my $f (0 .. $#ARGV) {
    print STDERR "  document ", ($f+1), ": $ARGV[$f]\r";

    my $file = $ARGV[$f];
    my $temp = "$data/$f";
    $original_filename{$temp} = $file;

    # Decide the type from the extension at the END of the name.  The
    # patterns are anchored so that something like "a.txt.html" or a
    # directory called "x.clauses.d" cannot select the wrong branch.
    my $type;
    if ($file =~ /\.clauses$/) {
        $type = "clauses";
    } elsif ($file =~ /\.te?xt$/) {
        $type = "txt";
    } elsif ($file =~ /\.html?$/i) {
        $type = "html";
    } elsif ($file =~ /\.cstr$/) {
        $type = "cstr";
    } else {
        die "Unknown file type: $file\n";
    }

    # Copy the file to the data directory.  File::Copy is used rather than
    # a shell "cp" because $file comes straight from the command line and
    # may contain spaces or shell metacharacters.  A file that cannot be
    # copied is reported and skipped; the remaining files are still done.
    if (!copy($file, "$temp.$type")) {
        warn "Kea cannot copy $file to $temp.$type: $!\n";
        next;
    }

    # Clause files need no further work.
    next if ($type eq "clauses");

    # Coerce html and cstr input into plain text.
    if ($type eq "html") {
        `$perl_command $gsdlhome/perllib/Kea-1.1.4/convert-html-to-text.pl $temp.html > $temp.txt`;
    } elsif ($type eq "cstr") {
        `$perl_command $gsdlhome/perllib/Kea-1.1.4/cstr-to-text.pl $temp.cstr $temp.txt`;
    }

    # prepare the file
    `$perl_command $gsdlhome/perllib/Kea-1.1.4/prepare-clauses.pl $temp.txt $temp.clauses`;
}
#print STDERR "\n\n";

# Build the arff file: k4.pl is given the stopword and document-frequency
# files, the optional keyphrase-frequency file and maximum phrase length,
# then the data directory and the name of the arff file.
$command = "$perl_command $gsdlhome/perllib/Kea-1.1.4/k4.pl -S $stopword_file -f $frequency_file";
$command .= " -K $keyword_frequency_file" if ($keyword_frequency_file);
$command .= " -L $maximum_phrase_length" if ($maximum_phrase_length);
$command .= " $data $arff";

#print STDERR "** $command **\n";
`$command`;
# A failure here used to go unnoticed and simply led to no keyphrases
# being written, so say so.  Only a warning: the next stage is still run.
if ($? != 0) {
    warn "Kea: k4.pl finished with status $?; $arff may be missing or incomplete\n";
}


# use KEP.java to extract phrases
$command = "$java_command KEP -m $model_file -T $arff";
$command .= " $number_of_phrases" if ($number_of_phrases);
# -R is correctly set by default in the java file when the model is loaded
# $command .= " -R tfidf,first_occurrence,keyword_freq,class" if ($keyword_frequency_file);
# KEP's standard output and standard error are kept in the working files.
$command .= " > $out 2> $err";

print STDERR "** $command  **\n";
`$command`;
if ($? != 0) {
    warn "Kea: KEP finished with status $?; see $err\n";
}


# Read output file and create .kea files
#
# The KEP output is scanned line by line.  A "Current document" line
# names the working clause file; it is mapped back to the original input
# name through %original_filename, and the output file is that name with
# its last extension replaced by $extension.  Each following "Miss:" line
# contributes one tab-separated line (unstemmed, stemmed, evidence and,
# with -t, tf.idf) to the current output file.  All other lines are
# ignored.

open(my $kep_fh, '<', $out)
    or die "Kea cannot read KEP output $out: $!\n";

$document = "";
my $doc_fh;     # handle of the output file currently being written

while (my $line = <$kep_fh>) {

    chomp($line);

    # new document
    if ($line =~ /^Current document/) {
        # close the old document
        if ($doc_fh) {
            close($doc_fh) or warn "Kea: error closing $document: $!\n";
            undef $doc_fh;
        }
        # Work out which input file this is.  Stop if the line cannot be
        # matched to one of our inputs: carrying on would write the
        # phrases to a stray file called just ".$extension".
        my ($doc) = $line =~ /Current document: (.*)\.clauses$/;
        if (!defined($doc) || !defined($original_filename{$doc})) {
            die "Kea cannot match KEP output to an input file: $line\n";
        }
        # start the new document
        $document = $original_filename{$doc};
        $document =~ s/\.[^\.]+$//;
        $document .= ".$extension";
        #print STDERR "Opening DOC: $original_filename{$doc} => $document\n";
        open($doc_fh, '>', $document)
            or die "Kea cannot write $document: $!\n";

    } elsif ($line =~ /^Miss:/) {
        die "Trying to write with no current document!" if (!$document);
        my ($phrase, $tfidf, $evidence) = $line =~ /Miss: ([^,]+).*,([^,]+),no (.+)$/;
        # Lines that do not have the expected shape are skipped instead
        # of being written out as a row of empty fields.
        next if (!defined($phrase));
        my ($stemmed, $unstemmed) = $phrase =~ /\'(.+) \((.+)\)\'/;
        next if (!defined($stemmed));
        if ($output_tfidf) {
            print $doc_fh "$unstemmed\t$stemmed\t$evidence\t$tfidf\n";
        } else {
            print $doc_fh "$unstemmed\t$stemmed\t$evidence\n";
        }
    }
}

# Close the last output file explicitly so that a write error (for
# example a full disk) is reported.
if ($doc_fh) {
    close($doc_fh) or warn "Kea: error closing $document: $!\n";
}
close($kep_fh);


#get rid of temporary files
#if (!$opt_d && !$opt_d) {
#  `rm -r $data $arff $out $err`;
#}