#!/usr/bin/perl

# Pragma
use strict;
use warnings;

# Modules
use File::Path qw(make_path);
use POSIX qw(strftime);

# Requires setup.bash to have been sourced
BEGIN
{
  # GSDLHOME has to be both defined and non-empty
  if (!defined $ENV{'GSDLHOME'} || $ENV{'GSDLHOME'} eq '')
  {
    die "GSDLHOME not set\n";
  }
  # the remaining variables only need to be defined; they are checked in
  # this order and the first one missing aborts compilation
  my @required_variables = (
    ['GSDLOS',                         "GSDLOS not set\n"],
    ['GEXTPARALLELBUILDING',           "GEXTPARALLELBUILDING not set\n"],
    ['GEXTPARALLELBUILDING_INSTALLED', "GEXTPARALLELBUILDING_INSTALLED not set\n"],
    ['HDFSHOST',                       "HDFS HOST not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n"],
    ['HDFSPORT',                       "HDFS PORT not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n"],
  );
  foreach my $requirement (@required_variables)
  {
    my ($variable_name, $complaint) = @{$requirement};
    if (!defined $ENV{$variable_name})
    {
      die $complaint;
    }
  }
}

# Positional arguments: collection name, maximum replication factor and
# number of iterations (the last two must consist solely of digits).
# printUsage() terminates the script, so each check acts as a guard.
my ($collection, $max_replication_factor, $iterations) = @ARGV;
unless (defined $collection)
{
  printUsage('Missing collection name');
}
unless (defined $max_replication_factor && $max_replication_factor =~ /^\d+$/)
{
  printUsage('Missing max replication factor or NAN');
}
unless (defined $iterations && $iterations =~ /^\d+$/)
{
  printUsage('Missing iterations or NAN');
}

# 1. Initialization
# - both switches are hard-wired off; flip them here when needed
my ($dry_run, $debug) = (0, 0);
# - who and where are we running?
chomp(my $user_name = `id -u -n`);
chomp(my $machine_name = `hostname -s`);
$machine_name = ucfirst($machine_name);
# - reduce the "Distributor ID:" line printed by lsb_release to its value
my $os_name = `lsb_release -i`;
$os_name =~ s/^Distributor ID:\s+(.*)\r?\n$/$1/i;
# - results live in a directory named after machine, OS and test parameters
my $db_path_suffix = sprintf('%s_%s_hadoop_hdfsshell_54_%s_1_i%s', $machine_name, $os_name, $max_replication_factor, $iterations);
my $test_dir = join('', $ENV{'GSDLHOME'}, '/collect/', $collection, '/results/', $db_path_suffix);
make_path($test_dir) unless (-d $test_dir);
my $db_path = join('', $test_dir, '/test.db');

# 2. Create and populate testing database as necessary
# - initialise when the database file is missing, or when it exists but the
#   tests table holds no rows (the query only runs if the file exists)
my $init_database = (!-f $db_path || '0' eq sqliteGetValue($db_path, 'SELECT COUNT(*) FROM tests')) ? 1 : 0;
if ($init_database > 0)
{
  print STDOUT " * Creating database tables\n";
  # one table for the timings and one for the captured output, both keyed
  # by (replication, iteration)
  my @table_definitions = (
    'CREATE TABLE IF NOT EXISTS tests (replication INTEGER, iteration INTEGER, timestamp INTEGER DEFAULT 0, realtime REAL DEFAULT 0, systime REAL DEFAULT 0, usertime REAL DEFAULT 0, PRIMARY KEY (replication, iteration))',
    'CREATE TABLE IF NOT EXISTS testoutput (replication INTEGER, iteration INTEGER,  output TEXT, PRIMARY KEY (replication, iteration))',
  );
  foreach my $definition (@table_definitions)
  {
    sqliteExec($db_path, $definition);
  }
  # one row per (replication, iteration) pair in each table
  print STDOUT " * Populating tests table\n";
  foreach my $replication (1 .. $max_replication_factor)
  {
    foreach my $iteration (1 .. $iterations)
    {
      foreach my $table_name ('tests', 'testoutput')
      {
        sqliteExec($db_path, 'INSERT INTO ' . $table_name . ' (replication, iteration) VALUES (' . $replication . ',' . $iteration . ')');
      }
    }
  }
}

# 3. Load random test and run it
# A test is pending for as long as its realtime column is still 0.
my $total_count = sqliteGetValue($db_path, 'SELECT COUNT(*) FROM tests');
my $test_count = sqliteGetValue($db_path, 'SELECT COUNT(*) FROM tests WHERE realtime=0');
# The loop stops before starting another test once this file exists
my $exit_file_path = $ENV{'GSDLHOME'} . '/collect/exit.now';
while ($total_count > 0 && $test_count > 0 && !-f $exit_file_path)
{
  my $x = $total_count - $test_count;
  my $timestamp = time();
  my $now_string = strftime("%a %b %e %H:%M:%S %Y", localtime($timestamp));
  print STDOUT ' * [' . $now_string . '] Progress: ' . sprintf("%.0f",(($x/$total_count)*100)) . '% complete! [' . $test_count . " tests remaining]\n";

  # 4. Pick a random pending test (replication factor and iteration) and
  # run and time it
  my ($replication, $iteration) = sqliteGetValues($db_path, 'SELECT replication, iteration FROM tests WHERE realtime=0 ORDER BY RANDOM() LIMIT 1');
  print STDOUT '   - running test hadoop import for collection=' . $collection . ', replication=' . $replication . ', iteration=' . $iteration . "\n";

  # 5. Change the HDFS replication to match
  print STDOUT '   - rebalance HDFS with replication: ' . $replication . "\n";
  my $hdfs_cmd = 'hadoop fs -setrep -w ' . $replication . ' -R /user/' . $user_name . '/gsdl/collect/' . $collection . '/import 2>&1';
  print STDOUT '[DEBUG] command: |' . $hdfs_cmd . "|\n" if ($debug);
  if (!$dry_run)
  {
    my $result = `$hdfs_cmd`;
    print STDOUT '[DEBUG] result: |' . $result . "|\n" if ($debug);
  }

  # 6. Now call hadoop_import.pl but pass in some extra options to control
  # where logs get written
  print STDOUT "   - ingest using Hadoop\n";
  my $import_cmd = '(time -p hadoop_import.pl "' . $collection . '" -logdir "' . $test_dir . '/' . $timestamp . '") 2>&1';
  print STDOUT '[DEBUG] command: |' . $import_cmd . "|\n" if ($debug);
  if ($dry_run)
  {
    # nothing was run, so just mark the test as no longer pending
    sqliteExec($db_path, 'UPDATE tests SET realtime=1 WHERE replication=' . $replication . ' AND iteration=' . $iteration);
  }
  else
  {
    my $result = `$import_cmd`;
    # The timing report is written once the command has finished, so it is
    # the final real/user/sys triple in the captured text. Use the LAST
    # match for each label so that similar looking text printed earlier by
    # the import itself cannot be mistaken for the timing.
    my %seconds_for = ('real' => 0, 'user' => 0, 'sys' => 0);
    foreach my $label (keys %seconds_for)
    {
      my @readings = ($result =~ /$label\s+(\d+\.\d+)/g);
      if (@readings)
      {
        $seconds_for{$label} = $readings[-1];
      }
    }
    # The statement is handed to the sqlite3 command line tool, so replace
    # every character that is special inside an SQL string literal or to a
    # shell with an entity. Backslash and dollar matter in practice: Java
    # class names such as Outer$Inner would otherwise be open to variable
    # expansion.
    $result =~ s/\\/&#92;/g;
    $result =~ s/\$/&#36;/g;
    $result =~ s/'/&apos;/g;
    $result =~ s/"/&quot;/g;
    $result =~ s/`/&#96;/g;
    print STDOUT '[DEBUG] result: |' . $result . "|\n" if ($debug);
    # 7. Write results to database
    # NOTE(review): when no elapsed time was found (or it was reported as
    # zero) realtime stays 0, so this test remains pending and will be
    # picked again - confirm that this retry is intended.
    sqliteExec($db_path, 'UPDATE tests SET timestamp=' . $timestamp . ', realtime=' . $seconds_for{'real'} . ', usertime=' . $seconds_for{'user'} . ', systime=' . $seconds_for{'sys'} . ' WHERE replication=' . $replication . ' AND iteration=' . $iteration);
    sqliteExec($db_path, "UPDATE testoutput SET output='" . $result . "' WHERE replication=" . $replication . " AND iteration=" . $iteration);
  }

  # Repeat until we have exhausted pending tests
  $test_count = sqliteGetValue($db_path, 'SELECT COUNT(*) FROM tests WHERE realtime=0');
}

# 8. Done.
# The exit file is a one-shot stop request: remove it so that the next run
# is not stopped immediately, and say so if the removal did not work.
if (-f $exit_file_path)
{
  print STDOUT "   - Removing exit file... ";
  if (unlink($exit_file_path))
  {
    print STDOUT "Done!\n";
  }
  else
  {
    print STDOUT "Failed! (" . $! . ")\n";
  }
}
print STDOUT "Complete!\n\n";
exit 0;


## @function sqliteExec()
#
# Run a SQL statement for its effect on the database. This simply delegates
# to sqliteGetValue(); whatever that call returns is handed back, although
# the callers in this script ignore it.
#
# @param  $db_path  path of the SQLite database file
# @param  $sql      the statement to run
#
sub sqliteExec
{
  my ($database_file, $statement) = @_;
  return sqliteGetValue($database_file, $statement);
}
## sqliteExec() ##


## @function sqliteGetValues()
#
# Fetch the columns of a single row. ' LIMIT 1' is appended to the query
# unless it already contains that text (in any letter case), and the
# text returned by sqliteGetValue() is then split on the pipe character.
#
# @param  $db_path  path of the SQLite database file
# @param  $sql      the query to run
# @return the list of column values
#
sub sqliteGetValues
{
  my ($database_file, $query) = @_;
  $query .= ' LIMIT 1' unless ($query =~ /LIMIT 1/i);
  my $row = sqliteGetValue($database_file, $query);
  my @columns = split(/\|/, $row);
  return @columns;
}
## sqliteGetValues() ##


## @function sqliteGetValue()
#
# Run a SQL statement through the sqlite3 command line tool and return
# whatever it printed (standard output and standard error combined) with
# leading and trailing whitespace removed.
#
# Dies with a 'Fatal Error!' message when the command exits with a non-zero
# status (this includes sqlite3 not being available) or when its output
# contains the text 'Error:'.
#
# @param  $db_path  path of the SQLite database file
# @param  $sql      the statement to run
# @return the trimmed output of the command
#
sub sqliteGetValue
{
  my ($db_path, $sql) = @_;
  # Wrap both arguments in single quotes (a literal single quote becomes
  # '\'') so the shell passes them on untouched. Inside double quotes the
  # shell would still act on dollar signs, backticks, backslashes and
  # double quotes that appear in the statement.
  my @shell_args = ();
  foreach my $argument ($db_path, $sql)
  {
    (my $escaped = $argument) =~ s/'/'\\''/g;
    push(@shell_args, "'" . $escaped . "'");
  }
  my $result = `sqlite3 $shell_args[0] $shell_args[1] 2>&1`;
  my $status = $?;
  $result = '' unless defined $result;
  # The wording of the tool's error messages is not relied upon alone;
  # the exit status is checked as well
  if ($status != 0 || $result =~ /Error:/)
  {
    if ($result !~ /\S/)
    {
      # no output to show, so describe the failure ourselves
      $result = ($status == -1) ? 'sqlite3 could not be run: ' . $! . "\n" : 'sqlite3 failed with wait status ' . $status . "\n";
    }
    die("Fatal Error!\nSQL:" . $sql . "\nMsg:" . $result);
  }
  # trim
  $result =~ s/^\s*|\s*$//g;
  return $result;
}
## sqliteGetValue() ##


## @function printUsage()
#
# Print an optional error message followed by the usage line on STDERR and
# terminate the script.
#
# @param  $msg  (optional) description of what was wrong with the arguments
# @return does not return; the exit status is 1 when a message was given
#         (so calling scripts can detect the failure) and 0 otherwise
#
sub printUsage
{
  my ($msg) = @_;
  # flush STDOUT so anything printed so far appears before the text written
  # to STDERR below: autoflush is switched on for STDOUT, an empty string is
  # printed, then autoflush is switched off again
  select((select(STDOUT), $|=1)[0]);
  print STDOUT '';
  select((select(STDOUT), $|=0)[0]);
  # output any error message
  if (defined $msg)
  {
    print STDERR 'Error! ' . $msg . "\n";
  }
  # and finally the usage
  print STDERR "Usage: replication_tests.pl <str:collection> <int:max replication> <int:iterations>\n";
  print STDERR "\n";
  # a usage error must not look like success to the caller
  exit(defined $msg ? 1 : 0);
}
## printUsage() ##

1;
