#!/usr/bin/perl -w # Kea # Version 1.1.4 # Kea -- Automatic Keyphrase Extraction # Copyright 1998-1999 by Gordon Paynter and Eibe Frank # Contact gwp@cs.waikato.ac.nz or eibe@cs.waikato.ac.nz # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # Version history # # 1.0 Witten et.al. # 1.0.1 Bug: stopword file loaded as model file # 1.0.2 Java paths explicit for nikau; JIT compiler # 1.0.3 Include tf.idf in output if -t is set # 1.0.4 Allow optional keyphrase frequency file # 1.0.5 Use $perl_command and $java_command for system-indepence # 1.0.6 -C argument selects model, stopword file, df file # 1.0.7 Changes to Kea.pl. # This is Phillip's version # 1.0.8 Accepts .htm as well as .html # 1.0.9 Accepts .text as well as .txt # 1.1 First Distribution. GPL added. Documentation added. # 1.1.1 -E argument sets output extension (default is .kea) # 1.1.2 Documented java variables # Maximum phrase length can be set at command-line. # Note: default=3; NOT the length for the model. Sorry. # 1.1.3 Moved Lynx command into separate script that checks # for circumstances that are likely to crash it. # 1.1.4 Updated documentation and added a few extra files. 
use strict;
use warnings;

use File::Copy qw(copy);
use File::Path qw(remove_tree);
use Getopt::Std qw(getopts);

# shell_quote($word)
#
# Return $word wrapped in single quotes so that a POSIX shell passes it to
# the child program as exactly one argument, whatever characters (spaces,
# quotes, '$', ';' ...) it contains.  Used for every path that is spliced
# into a command line below.
sub shell_quote {
    my ($word) = @_;
    $word =~ s/'/'\\''/g;
    return "'$word'";
}

# run_command($command)
#
# Run $command through the shell, discarding whatever it writes to standard
# output (the helper programs report through files, not through stdout).
# Returns 1 if the command exited with status 0; otherwise prints a warning
# and returns 0.  It never dies: the exit-status conventions of the helper
# scripts are not visible from here, so a failure is reported, not fatal.
sub run_command {
    my ($command) = @_;
    `$command`;
    return 1 if ($? == 0);
    if ($? == -1) {
        warn "Kea: cannot run command ($!): $command\n";
    }
    else {
        warn "Kea: command failed (status $?): $command\n";
    }
    return 0;
}

# copy_or_warn($from, $to)
#
# Copy a file without involving the shell.  Returns 1 on success; on failure
# prints a warning and returns 0.
sub copy_or_warn {
    my ($from, $to) = @_;
    return 1 if (copy($from, $to));
    warn "Kea: cannot copy $from to $to: $!\n";
    return 0;
}

print STDERR "\nKea (version 1.1.4): automatic keyphrase extraction\n";

# Every path used below hangs off GSDLHOME, so refuse to run without it
# instead of silently building paths that start at "/perllib/...".
my $gsdlhome = $ENV{'GSDLHOME'};
die "Kea: the GSDLHOME environment variable is not set\n"
    if (!defined($gsdlhome) || $gsdlhome eq "");

my $kea_home = "$gsdlhome/perllib/Kea-1.1.4";

# Build the iterated Lovins stemmer, but only when the binary is missing;
# there is no need to invoke gcc on every run.
my $stemmer = "$kea_home/stemmer";
if (!(-f $stemmer && -x $stemmer)) {
    run_command("gcc -o " . shell_quote($stemmer)
        . " " . shell_quote("$kea_home/Iterated-Lovins-stemmer/stem-Lovins-iterated.c")
        . " " . shell_quote("$kea_home/Iterated-Lovins-stemmer/stem.c"));
}

# Kea runs other perl scripts in shells.
my $perl_command = "perl -w";

# Java is a bit more difficult.

# set this variable to your java home directory
#my $java_home = "/usr/local/jdk";
my $java_home = "/usr/local/share/java";

# this variable will hold the CLASSPATH for java which we set at
# the command line to incorporate jaws.jar; you shouldn't need to change it
my $java_classpath = ".:$kea_home/jaws.jar:$kea_home:$java_home/lib/classes.zip";

# The name of your java just-in-time compiler.  I use TYA.
# An empty string means you don't have a JIT compiler.
my $java_JIT_compiler = "";
#my $java_JIT_compiler = "tya";

# If you want to give java lots of memory or use other arguments,
# use this variable, otherwise make it an empty string.
my $java_extra_args = "-ss100000000 -oss100000000 -mx200000000";

# The actual java command is based on these other variables:
my $java_command = shell_quote("$java_home/bin/java")
    . " -classpath " . shell_quote($java_classpath);
$java_command .= " -Djava.compiler=$java_JIT_compiler" if ($java_JIT_compiler);
$java_command .= " $java_extra_args" if ($java_extra_args);

# Parse command line options.  Getopt::Std understands the same option
# string as the Perl 4 library getopts.pl, which modern perls no longer ship.
my $usage = <<'END_USAGE';
Usage: Kea [options] text-or-html-or-cstr-files

Options:
  -d         Debug mode
  -t         Output TF.IDF
  -N n       Output n keyphrases
  -L n       Maximum phrases length is n (default = 3)
  -E <ext>   Output extension is <ext> (default = kea)
  -C <stem>  Use model/df/kf/stopwords based on <stem>
  -F <file>  Document frequency file
  -K <file>  Keyword frequency file
  -M <file>  Model file
  -S <file>  Stopword file

See README for more detail.
END_USAGE

my %opt;
getopts("dtN:E:C:F:K:L:M:S:", \%opt) or die $usage;

# What files shall we use?
die $usage if (!@ARGV);

# -N Number of phrases to extract
my $number_of_phrases = "";
if (($opt{N}) && ($opt{N} =~ /^\d+$/) && ($opt{N} > 0)) {
    $number_of_phrases = "-N $opt{N}";
    print STDERR "Number of phrases to extract: $opt{N}\n";
}
elsif ($opt{N}) {
    die "Kea cannot understand -N argument (must be a number): $opt{N}\n";
}
else {
    print STDERR "Number of phrases dictated by model (default)\n";
}

# -L maximum phrase length (0 means "do not pass -L to k4.pl")
my $maximum_phrase_length = 0;
if (($opt{L}) && ($opt{L} =~ /^\d+$/) && ($opt{L} > 0)) {
    $maximum_phrase_length = "$opt{L}";
    print STDERR "Maximum phrase length: $opt{L}\n";
}
elsif ($opt{L}) {
    die "Kea cannot understand -L argument (must be a number): $opt{L}\n";
}

# -E What extension shall we use?  A leading dot is optional.
my $extension = "kea";
if ($opt{E}) {
    $extension = $opt{E};
    $extension =~ s/^\.//g;
    print STDERR "Using output extension: $extension\n";
}

# -t Should we output tfidf? (This option is used by Kniles.)
my $output_tfidf = 0;
if ($opt{t}) {
    $output_tfidf = 1;
    print STDERR "Do print tf.idf\n";
}

# -C Corpus file stem
my $default_stem = "$kea_home/aliweb";
if ($opt{C}) {
    print STDERR "Corpus: $opt{C} (setting default model/stopwords/df)\n";
    $default_stem = "$kea_home/$opt{C}";
}

# set the default model etc; the keyphrase frequency file is optional
my $model_file             = "$default_stem.model";
my $stopword_file          = "$default_stem.stopwords";
my $frequency_file         = "$default_stem.df";
my $keyword_frequency_file = "";
$keyword_frequency_file = "$default_stem.kf" if (-e "$default_stem.kf");

# -F Document Frequency file
$frequency_file = "$kea_home/$opt{F}" if ($opt{F});
print STDERR "Document frequency file: $frequency_file\n";
die "Document frequency file does not exist!\n" if (!(-e $frequency_file));

# -M Model file
$model_file = "$kea_home/$opt{M}" if ($opt{M});
print STDERR "Model file: $model_file\n";
die "Model file does not exist!\n" if (!(-e $model_file));

# -S Stopword file
$stopword_file = "$kea_home/$opt{S}" if ($opt{S});
print STDERR "Stopword file: $stopword_file\n";
die "Stopword file does not exist!\n" if (!(-e $stopword_file));

# -K Keyword frequency file
$keyword_frequency_file = "$kea_home/$opt{K}" if ($opt{K});
if ($keyword_frequency_file) {
    print STDERR "Keyword frequency file: $keyword_frequency_file\n";
    die "Keyword frequency file does not exist!\n" if (!(-e $keyword_frequency_file));
}
else {
    print STDERR "No keyword frequency file (default)\n";
}

# Count the number of files
my $number_of_files = scalar(@ARGV);
print STDERR "Number of files: $number_of_files\n\n";

# Set up working files
my $stem = "$gsdlhome/tmp/kea.$$";
my $data = "$stem.data";
my $arff = "$stem.arff";
my $out  = "$stem.out";
my $err  = "$stem.err";

# Working files are never cleaned up (see the end of the script) and process
# ids are recycled, so a directory left by an older run may already exist
# under this name.  Remove it so its documents cannot leak into this run.
remove_tree($data) if (-e $data);
mkdir($data, 0777)
    or die "Kea cannot create working directory $data: $!\n";
# mkdir() is subject to the umask; force the mode that "mkdir -m 777" gave.
chmod(0777, $data)
    or warn "Kea: cannot set permissions on $data: $!\n";

# Process each input file into a working file.  Document number N is staged
# as $data/N.<type>; %original_filename maps that stem back to the name the
# user gave, so the results can be written next to the original.
print STDERR "Preparing input files in: $data\n";
my %original_filename;

DOCUMENT:
for my $f (0 .. $#ARGV) {
    my $file = $ARGV[$f];
    my $temp = "$data/$f";

    print STDERR " document ", ($f + 1), ": $file\r";
    $original_filename{$temp} = $file;

    # a clause file is used as it is
    if ($file =~ /.*\.clauses/) {
        copy_or_warn($file, "$temp.clauses");
        next DOCUMENT;
    }

    # anything else is copied to the data directory and coerced into text;
    # a document that cannot even be copied is skipped
    if ($file =~ /.*\.te?xt/) {
        copy_or_warn($file, "$temp.txt") or next DOCUMENT;
    }
    elsif ($file =~ /.*\.html?/i) {
        copy_or_warn($file, "$temp.html") or next DOCUMENT;
        run_command("$perl_command "
            . shell_quote("$kea_home/convert-html-to-text.pl")
            . " " . shell_quote("$temp.html")
            . " > " . shell_quote("$temp.txt"));
    }
    elsif ($file =~ /.*\.cstr/) {
        copy_or_warn($file, "$temp.cstr") or next DOCUMENT;
        run_command("$perl_command "
            . shell_quote("$kea_home/cstr-to-text.pl")
            . " " . shell_quote("$temp.cstr")
            . " " . shell_quote("$temp.txt"));
    }
    else {
        die "Unknown file type: $file\n";
    }

    # prepare the file: text in, clauses out
    run_command("$perl_command "
        . shell_quote("$kea_home/prepare-clauses.pl")
        . " " . shell_quote("$temp.txt")
        . " " . shell_quote("$temp.clauses"));
}
#print STDERR "\n\n";

# Build the arff file
my $command = "$perl_command " . shell_quote("$kea_home/k4.pl")
    . " -S " . shell_quote($stopword_file)
    . " -f " . shell_quote($frequency_file);
$command .= " -K " . shell_quote($keyword_frequency_file) if ($keyword_frequency_file);
$command .= " -L $maximum_phrase_length" if ($maximum_phrase_length);
$command .= " " . shell_quote($data) . " " . shell_quote($arff);
#print STDERR "** $command **\n";
run_command($command);

# use KEP.java to extract phrases
$command = "$java_command KEP -m " . shell_quote($model_file)
    . " -T " . shell_quote($arff);
$command .= " $number_of_phrases" if ($number_of_phrases);
# -R is correctly set by default in the java file when the model is loaded
# $command .= " -R tfidf,first_occurrence,keyword_freq,class" if ($keyword_frequency_file);
$command .= " > " . shell_quote($out) . " 2> " . shell_quote($err);
print STDERR "** $command **\n";
run_command($command);

# Read output file and create .kea files.  A "Current document" line starts
# the results for one input file; each following "Miss:" line is one
# extracted phrase for that file.
open(my $kep_fh, '<', $out)
    or die "Kea cannot read KEP output file $out: $!\n";

my $document = "";    # name of the output file currently being written
my $doc_fh;           # handle on that file

while (my $line = <$kep_fh>) {
    chomp($line);

    # new document
    if ($line =~ /^Current document/) {

        # close the old document
        if ($doc_fh) {
            close($doc_fh) or warn "Kea: error closing $document: $!\n";
            undef $doc_fh;
        }

        # map the working file named in the output back to the input file
        my ($doc) = $line =~ /Current document: (.*)\.clauses$/;
        die "Kea cannot match KEP output to an input file: $line\n"
            if (!defined($doc) || !defined($original_filename{$doc}));

        # the output file is the input file with its extension replaced
        $document = $original_filename{$doc};
        $document =~ s/\.[^\.]+$//;
        $document .= ".$extension";

        #print STDERR "Opening DOC: $original_filename{$doc} => $document\n";
        open($doc_fh, '>', $document)
            or die "Kea cannot write $document: $!\n";
    }
    elsif ($line =~ /^Miss:/) {

        die "Trying to write with no current document!" if (!$document);

        my ($phrase, $tfidf, $evidence) =
            $line =~ /Miss: ([^,]+).*,([^,]+),no (.+)$/;
        my ($stemmed, $unstemmed) =
            defined($phrase) ? ($phrase =~ /\'(.+) \((.+)\)\'/) : ();

        # do not emit rows of empty columns for a line we cannot parse
        if (!defined($unstemmed)) {
            warn "Kea: cannot parse KEP output line: $line\n";
            next;
        }

        if ($output_tfidf) {
            print {$doc_fh} "$unstemmed\t$stemmed\t$evidence\t$tfidf\n";
        }
        else {
            print {$doc_fh} "$unstemmed\t$stemmed\t$evidence\n";
        }
    }
}

# Close explicitly so that buffered write errors are reported.
if ($doc_fh) {
    close($doc_fh) or warn "Kea: error closing $document: $!\n";
}
close($kep_fh);

# Get rid of temporary files.  This is disabled, so the working files stay
# in $gsdlhome/tmp after every run.
#if (!$opt{d}) {
#    `rm -r $data $arff $out $err`;
#}