#!/usr/bin/perl -w

###########################################################################
#
# Enhance the Terrier FileIndexer component with parallel processing
# capability.
#
# Copyright (C) 2012 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package parallel_terrier_fileindexer;

# Compile-time setup: verify the Greenstone environment and extend @INC so
# that Greenstone's own libraries, and those of every enabled extension, can
# be found. Dies unless GSDLHOME, GSDLOS and GSDLEXTS are all defined and
# the colon-separated GSDLEXTS list contains 'parallel-building'.
BEGIN
{
  # - each required environment variable paired with its complaint
  my @required_settings = (
    ['GSDLHOME', "GSDLHOME not set\n"],
    ['GSDLOS',   "GSDLOS not set\n"],
    ['GSDLEXTS', "GSDL Extensions not enabled\n"],
  );
  foreach my $setting (@required_settings)
  {
    my ($variable, $complaint) = @{$setting};
    die $complaint unless defined $ENV{$variable};
  }

  # - library directories, listed in the order they are pushed onto the
  #   front of @INC (so the last one listed ends up being searched first)
  my @library_dirs = ('perllib', 'perllib/cpan', 'perllib/plugins', 'perllib/plugouts');
  foreach my $library_dir (@library_dirs)
  {
    unshift (@INC, "$ENV{'GSDLHOME'}/$library_dir");
  }

  # - repeat for every enabled extension, noting whether the parallel
  #   building extension is amongst them
  my $has_parallel_building = 0;
  foreach my $extension (split(/:/, $ENV{'GSDLEXTS'}))
  {
    $has_parallel_building = 1 if ($extension eq 'parallel-building');
    foreach my $library_dir (@library_dirs)
    {
      unshift (@INC, "$ENV{'GSDLHOME'}/ext/$extension/$library_dir");
    }
  }
  die "GSDL Parallel Building Extension not installed\n" unless ($has_parallel_building);
}

use strict;
use warnings;

# /** @function debugPrint
#  *  Emit a diagnostic message on STDERR, prefixed with '[SDEBUG] ', but
#  *  only when debugging is switched on. No newline is appended, so the
#  *  caller supplies its own line ending.
#  *  @param  $debug    a true value enables the output
#  *  @param  $message  the text to write
#  */
sub debugPrint
{
  my ($enabled, $text) = @_;
  print STDERR '[SDEBUG] ' . $text if ($enabled);
}
# /** debugPrint(boolean, String) **/

# /** @function fileCat
#  *  Build a path by joining every argument with '/' and then collapsing
#  *  each run of consecutive slashes into a single slash.
#  *  @param  @_  the path components, in order
#  *  @return the joined path (an empty string when no components are given)
#  */
sub fileCat
{
  my @components = @_;
  my $joined = join('/', @components);
  # - the 's' (squeeze) modifier reduces every run of slashes to one slash
  $joined =~ tr{/}{/}s;
  return $joined;
}
# /** fileCat(String, String ...) */

# /** @function printUsage
#  *  Print an optional error message followed by the usage summary on
#  *  STDERR, write the timestamped completion line on STDOUT, then
#  *  terminate the script. This function never returns.
#  *
#  *  The exit status is 1 when an error message was supplied, so a calling
#  *  process can tell that the run was rejected, and 0 when the usage text
#  *  was requested without an error.
#  *
#  *  @param  $message  (optional) description of what was wrong with the
#  *                    command line
#  */
sub printUsage
{
  my ($message) = @_;
  my $exit_status = 0;
  if (defined $message)
  {
    print STDERR 'Error! ' . $message . "\n";
    # - a reported error must not look like a successful run to the caller
    $exit_status = 1;
  }
  print STDERR 'Usage: parallel_terrier_fileindexer.pl -terrier <path> -collection <path> -workers <num> -batchsize <num> [-maxfiles <num>] [-debug]' . "\n\n";
  # - concatenation puts localtime() in scalar context, giving the
  #   human-readable date string rather than the list of time fields
  print '[' . time() . ']Parallel FileIndexer Complete: ' . localtime() . "\n";
  exit($exit_status);
}
# /** printUsage(String) **/

# /** @function _findManifests
#  *  List the manifest files (named manifest-<digits>.spec) found directly
#  *  inside a directory. Dies if the directory cannot be opened.
#  *  @param  $dir_path  the directory to scan
#  *  @return the matching file names, without any directory part
#  */
sub _findManifests
{
  my ($dir_path) = @_;
  opendir(my $dir_handle, $dir_path) or die('Error! Failed to open var path for reading: ' . $!);
  # - the dot is escaped and the pattern anchored at both ends so that only
  #   names of exactly this form are treated as manifests
  my @manifests = grep { /^manifest-\d+\.spec$/ } readdir($dir_handle);
  closedir($dir_handle);
  return @manifests;
}
# /** _findManifests(String) **/

# /** @function _removeDirectory
#  *  Recursively delete a directory if it exists; do nothing otherwise.
#  *  @param  $debug     true to echo the equivalent shell command on STDERR
#  *  @param  $dir_path  the directory to remove
#  */
sub _removeDirectory
{
  my ($debug, $dir_path) = @_;
  if (!-d $dir_path)
  {
    return;
  }
  &debugPrint($debug, 'command: rm -rf "' . $dir_path . '"' . "\n");
  # - list-form system() bypasses the shell, so characters in the path are
  #   never interpreted; '--' guards against a path that starts with '-'
  if (0 != system('rm', '-rf', '--', $dir_path))
  {
    print STDERR 'Warning! Failed to remove directory: ' . $dir_path . "\n";
  }
}
# /** _removeDirectory(boolean, String) **/

# /** @function _runCommand
#  *  Run a shell command, discarding whatever it writes to STDOUT. A
#  *  command that cannot be started, or that finishes with a non-zero
#  *  status, is reported on STDERR but does not abort the script.
#  *  @param  $debug    true to echo the command on STDERR first
#  *  @param  $command  the complete shell command line
#  */
sub _runCommand
{
  my ($debug, $command) = @_;
  &debugPrint($debug, 'command: ' . $command . "\n");
  # - backticks capture the child's STDOUT, which is deliberately dropped
  `$command`;
  if (-1 == $?)
  {
    print STDERR 'Warning! Failed to run command (' . $! . '): ' . $command . "\n";
  }
  elsif (0 != $?)
  {
    print STDERR 'Warning! Command exited with status ' . ($? >> 8) . ': ' . $command . "\n";
  }
}
# /** _runCommand(boolean, String) **/

# /** @function main
#  *  Entry point. Reads the command line from @ARGV and then:
#  *    1. validates the arguments (any problem is reported via printUsage,
#  *       which terminates the script);
#  *    2. deletes the manifests, index and assoc directory left by an
#  *       earlier run;
#  *    3. asks the Terrier FileIndexer to split the collection into
#  *       manifest files;
#  *    4. indexes the collection, directly when there is one manifest and
#  *       through mpirun when there are several;
#  *    5. asks the FileIndexer to merge the resulting indexes.
#  *
#  *  Recognised arguments: -terrier <path>, -collection <path>,
#  *  -workers <num>, -batchsize <num>, -maxfiles <num> and -debug.
#  *
#  *  Dies if the Terrier var directory cannot be read or if no manifest
#  *  files were generated.
#  */
sub main
{
  print '[SCRIPT:' . time() . "] Starting Parallel FileIndexer\n";

  # 1. Initialization
  my $class_name = 'org.terrier.applications.FileIndexer';
  my $worker_count = 0;
  my $terrier_home = '';
  my $collection_path = '';
  my $batch_size = 0;
  my $debug = 0;
  my $max_files = 0;
  # - parse arguments; every option listed here consumes the word after it
  my %value_options = (
    '-workers'    => \$worker_count,
    '-terrier'    => \$terrier_home,
    '-collection' => \$collection_path,
    '-batchsize'  => \$batch_size,
    '-maxfiles'   => \$max_files,
  );
  my @arguments = @ARGV;
  while (scalar(@arguments) > 0)
  {
    my $argument = shift(@arguments);
    if ('-debug' eq $argument)
    {
      $debug = 1;
    }
    elsif (exists $value_options{$argument})
    {
      # - an option given last on the command line has no value to read
      if (0 == scalar(@arguments))
      {
        &printUsage('Missing value for argument: ' . $argument);
      }
      ${$value_options{$argument}} = shift(@arguments);
    }
    else
    {
      &printUsage('Unrecognized argument: ' . $argument);
    }
  }
  print '[SCRIPT] Worker Count: ' . $worker_count . "\n";
  print '[SCRIPT] Terrier Home: ' . $terrier_home . "\n";
  print '[SCRIPT] Collection:   ' . $collection_path . "\n";
  print '[SCRIPT] Batch Size:   ' . $batch_size . "\n";
  print '[SCRIPT] Debug:        ' . $debug . "\n";

  # - check arguments
  if ($worker_count !~ /^\d+$/)
  {
    &printUsage('Worker count must be an integer');
  }
  if ('' eq $terrier_home || !-d $terrier_home)
  {
    &printUsage('Terrier home path given doesn\'t exist or isn\'t a directory');
  }
  if ('' eq $collection_path || !-d $collection_path)
  {
    &printUsage('Collection path given doesn\'t exist or isn\'t a directory');
  }
  if ($batch_size !~ /^\d+$/)
  {
    &printUsage('Batch size count must be an integer');
  }
  if ($max_files !~ /^\d+$/)
  {
    &printUsage('Maximum file count must be an integer');
  }
  if (0 == $worker_count || 0 == $batch_size)
  {
    print STDOUT "Warning! Zero workers or a batch size of zero causes a serial index.\n";
    $batch_size = 0;
  }
  # - derived variables
  my $anyclass_exe = &fileCat($terrier_home, 'bin', 'anyclass.sh');

  # 2. Remove any existing index
  print STDOUT "[SCRIPT] Removing old index files...\n";
  my $var_path = &fileCat($terrier_home, 'var');
  foreach my $old_file (&_findManifests($var_path))
  {
    my $old_path = &fileCat($var_path, $old_file);
    &debugPrint($debug, 'deleting ' . $old_path . "\n");
    unlink($old_path);
  }
  # - these two directories are independent of the manifests, so they are
  #   removed once rather than once per directory entry
  &_removeDirectory($debug, &fileCat($var_path, 'index'));
  &_removeDirectory($debug, &fileCat($terrier_home, 'share', 'images', 'assoc'));

  # 3. Prepare the collection for parallel indexing
  print STDOUT "[SCRIPT] Prepare collection for indexing...\n";
  my $prepare_command = $anyclass_exe . ' ' . $class_name . ' -prepare -path "' . $collection_path . '"';
  if (0 < $batch_size)
  {
    $prepare_command .= ' -batchsize ' . $batch_size;
  }
  if (0 < $max_files)
  {
    $prepare_command .= ' -maxfiles ' . $max_files;
  }
  &_runCommand($debug, $prepare_command);
  # - count the number of manifest files generated
  my $manifest_count = scalar(my @manifests = &_findManifests($var_path));
  print STDOUT '[SCRIPT] => generated ' . $manifest_count . " manifest files\n";
  if (0 >= $manifest_count)
  {
    die('Error! Failed to generate any manifest files.');
  }

  # 4a. If we only have a single manifest, then we call the indexer directly.
  if (1 == $manifest_count)
  {
    print STDOUT "[SCRIPT] Index collection using serial processing\n";
    my $manifest_path = &fileCat($var_path, 'manifest-000.spec');
    my $index_command = $anyclass_exe . ' ' . $class_name . ' -index -path "' . $manifest_path . '" -prefix 000';
    &_runCommand($debug, $index_command);
  }
  # 4b. Call OpenMPI enabled executable to perform parallel processing
  else
  {
    print STDOUT "[SCRIPT] Index collection with parallel processing (" . $worker_count . " workers)\n";
    my $mpi_flags = '--show-progress --verbose ';
    # Excessive force! Ensure we bind to the correct network interface
    $mpi_flags .= '--mca btl tcp,sm,self --mca btl_tcp_if_include eth0 ';
    #$mpi_flags .= '-nolocal ';
    # - a machine file in the Terrier home selects cluster mode
    my $mpi_conf_path = &fileCat($terrier_home, 'mpi.conf');
    if (-f $mpi_conf_path)
    {
      print STDOUT "(cluster)\n";
      $mpi_flags .= '-machinefile "' . $mpi_conf_path . '" ';
    }
    else
    {
      print STDOUT "(multicore)\n";
    }
    # - one process more than the number of workers is requested
    my $mpi_command = 'mpirun ' . $mpi_flags . ' -np ' . ($worker_count + 1) . ' mpiterrierfileindexer "' . $ENV{'GSDLHOME'} . '" "' . $terrier_home . '" ' . $manifest_count;
    &_runCommand($debug, $mpi_command);
  }

  # 5. Merge the indexes
  # - if we performed a serial process above, then this will just rename the
  #   index files
  print STDOUT "[SCRIPT] Merging Indexes\n";
  my $merge_command = $anyclass_exe . ' ' . $class_name . ' -merge';
  &_runCommand($debug, $merge_command);

  # Complete!
  print '[SCRIPT:' . time() . "] Complete!\n\n";
  return 1;
}
# /** main() **/

# Run the indexer as soon as the script is executed.
main();

# Finish the file with a true value.
1;
