#!/usr/bin/perl

# Startup banner: the tool's name and a two-line summary of what it does,
# framed by a blank line above and below. Emitted with a single print call.
print "\n",
      "==================== GDBM Diff ====================\n",
      "Diff two GDBM files to determine if their key value\n",
      "pairs differ in any way. Ignores ordering of pairs.\n",
      "---------------------------------------------------\n",
      "\n";

# 0. Initialize and check arguments
# - every key-value pair read from the first GDBM file is stored here
my $data = {};
# - maps each differing key to a code: 1 = only in the first file,
#   2 = only in the second file, 3 = in both files but with different values
my $differences = {};
# - read in GDBM file paths from the arguments. Both are required and must
#   name existing plain files; printUsage() reports the problem and then
#   terminates the script, so execution only continues past a check when
#   that check has passed.
if (!defined $ARGV[0] || !-f $ARGV[0])
{
  printUsage('First GDBM file not specified or isn\'t a file');
}
my $gdbm_one_path = $ARGV[0];
if (!defined $ARGV[1] || !-f $ARGV[1])
{
  printUsage('Second GDBM file not specified or isn\'t a file');
}
my $gdbm_two_path = $ARGV[1];
print "GDBM 1: " . $gdbm_one_path . "\n";
print "GDBM 2: " . $gdbm_two_path . "\n";
# - also check that GSDLHOME is set and that db2txt is available
if (!defined $ENV{GSDLHOME})
{
  printUsage('GSDLHOME not set. Please source Greenstone\'s setup.bash first.');
}
print "Found Greenstone environment\n";
# - run db2txt with no arguments and look for its usage message; anything
#   else (e.g. the shell's "command not found") means it isn't on the PATH
my $test_result = `db2txt 2>&1`;
if ($test_result !~ /usage\:\s+db2txt\s+database\-name/i)
{
  printUsage('The program db2txt could not be found. Ensure Greenstone environment is set up properly and that bin/<os>/db2txt exists and is executable.');
}
print "Found application db2txt\n";
print "\n";

# 1. Convert the first GDBM file to text using db2txt, then lift every
#    key-value record out of that text and remember it in $data
print " * Read in first GDBM file: ", $gdbm_one_path, "\n";
my $cmd = join('', 'db2txt ', $gdbm_one_path, ' 2>&1');
my $txt = qx{$cmd};
# - a record is "[key]", a newline, the value, a newline and then a line of
#   dashes. Records are stripped out of $txt one at a time, so anything that
#   does not parse as a record is what remains in $txt afterwards.
while ($txt =~ s/\[([^\]]+)\]\n(.*?)\n\-+\n//s)
{
  my ($record_key, $record_value) = ($1, $2);
  print " - storing key '", $record_key, "'\n";
  print " - value '", $record_value, "'\n";
  $data->{$record_key} = $record_value;
}
print "   - read ", scalar(keys %{$data}), " pairs\n";
# - show any remaining text that still contains word characters
print "   - left over txt: |", $txt, "|\n" if ($txt =~ /\w/);
print "\n";

# 2. Now we parse the second GDBM file in a similar fashion, except now we
#    compare any keys found to ones in the existing data structure. In the
#    case that the key doesn't exist, then we've found a record in B that
#    is not in A. In the case it does exist we compare the values to see
#    if they are the same. Either way, a key found in both files is removed
#    from the data structure once it has been dealt with, so that whatever
#    remains afterwards is exactly the set of records that exist only in A.
print " * Read in second GDBM file: " . $gdbm_two_path . "\n";
$cmd = 'db2txt ' . $gdbm_two_path . ' 2>&1';
$txt = `$cmd`;
my $b_pair_count = 0;
# - records are stripped out of $txt one at a time, so anything that does not
#   parse as a "[key]\nvalue\n----\n" record is left behind in $txt
while ($txt =~ s/\[([^\]]+)\]\n(.*?)\n\-+\n//s)
{
  my $b_key = $1;
  my $b_value = $2;
  print STDERR " - testing key: '" . $b_key . "'\n";
  print STDERR " - value '" . $b_value . "'\n";
  # - key is in B only
  if (!exists $data->{$b_key})
  {
    print STDERR " - couldn't find in A\n";
    $differences->{$b_key} = 2;
  }
  # - key is in both A and B
  else
  {
    if ($data->{$b_key} ne $b_value)
    {
      print STDERR " - different value for A\n";
      $differences->{$b_key} = 3;
    }
    else
    {
      print STDERR " - the same!\n";
    }
    # - remove from data structure as we've dealt with this entry. This must
    #   happen for differing values too, otherwise the left-over pass below
    #   would overwrite the '3' with '1' and misreport the key as only in A.
    delete($data->{$b_key});
  }
  $b_pair_count++;
}
print "   - read " . $b_pair_count . " pairs\n";
if ($txt =~ /\w/)
{
  print "   - left over txt: |" . $txt . "|\n";
}
# - now we tackle the final case, that of records left over (hence were found
#   in A but not in B).
foreach my $a_key (keys %{$data})
{
  $differences->{$a_key} = 1;
}
print "\n";

# 3. Report the outcome: either the files match, or list every differing key
#    (sorted) together with the kind of difference recorded for it
print "Result: ";
my $difference_count = scalar keys %{$differences};
if (!$difference_count)
{
  print "Files match!\n";
}
else
{
  print "Found ", $difference_count, " differences.\n\n";
  foreach my $d_key (sort keys %{$differences})
  {
    my $code = $differences->{$d_key};
    # - codes 1 and 2 have their own labels; any other code is reported as
    #   a differing value
    my $label = ($code == 1) ? "   ! only in gdbm one: "
              : ($code == 2) ? "   ! only in gdbm two: "
              :                "   ! values for key differ: ";
    print $label, $d_key, "\n";
  }
}
print "\n",
      "==================== Complete! ====================\n\n";

exit;

# Print an optional fatal error message followed by the usage line, then
# terminate the script. Never returns.
#   $msg - (optional) description of the fatal error. When given it is
#          printed with a "Fatal Error! " prefix and the script exits with
#          status 1 so that a calling shell or script can detect the failure.
#          When omitted only the usage line is printed and the exit status
#          is 0.
sub printUsage
{
  my ($msg) = @_;
  my $exit_status = 0;
  if (defined $msg)
  {
    print "Fatal Error! " . $msg . "\n";
    $exit_status = 1;
  }
  print "Usage: gdbm-diff.pl <gdbm db path> <gdbm db path>\n\n";
  exit($exit_status);
}
# /** printUsage() **/
