#!/usr/bin/perl

# Given a large Greenstone import directory, create a random subset of that
# import collection with a specific document count. Uses symlinking so won't
# work well under windows.
# jmt12

use strict;
use warnings;

# File::Spec is a core module; it is used below to turn the picked document
# path into an absolute symlink target.
use File::Spec;

# Both arguments are required: an existing import directory and the number of
# documents wanted (digits only).
if (!defined $ARGV[0] || !-d $ARGV[0] || !defined $ARGV[1] || $ARGV[1] !~ /^\d+$/)
{
  print "usage: importsubsetinator.pl <import directory> <max number of documents>\n";
  exit(0);
}

my $import_dir = $ARGV[0];
my $max_docs = $ARGV[1];

# The subset is written to ./import-<count>; reusing an existing directory is
# fine, but failing to create a missing one is fatal.
my $subset_dir = 'import-' . $max_docs;
if (!-d $subset_dir && !mkdir($subset_dir, 0755))
{
  die("Failed to create subset directory '" . $subset_dir . "': $!\n");
}

# Documents are chosen by random picking with rejection of duplicates, so the
# loop could never finish when fewer documents are available than requested
# (or when linking keeps failing). Give up after this many consecutive picks
# that added nothing new. The limit is a heuristic.
my $miss_limit = 100 * $max_docs;
if ($miss_limit < 100000)
{
  $miss_limit = 100000;
}
my $misses = 0;

# 1. While we haven't reached our target
print "Processing";
my $current_docs = 0;
while ($current_docs < $max_docs)
{
  if ($misses >= $miss_limit)
  {
    last;
  }
  # 2. Find a random document
  my $path = pickRandomDoc($import_dir);
  if (!defined $path)
  {
    die("\nNo documents found in '" . $import_dir . "'\n");
  }
  # Everything after "<import dir>/" is reproduced below the subset directory
  my $path_suffix = substr($path, length($import_dir) + 1);
  # 3. Check we don't have it already. -l is tested as well as -e because -e
  #    follows symlinks and so reports false for a dangling link.
  my $target_path = './' . $subset_dir . '/' . $path_suffix;
  if (-l $target_path || -e $target_path)
  {
    $misses++;
    next;
  }
  # 4. Symlink it into the subset directory. The link target is made absolute:
  #    a relative target would be resolved relative to the directory holding
  #    the link, not the current directory. The builtin symlink() is used
  #    rather than a shell command so that paths containing spaces or shell
  #    metacharacters are handled and failures can be detected.
  recursiveMkdir($subset_dir, $target_path);
  if (!symlink(File::Spec->rel2abs($path), $target_path))
  {
    warn("\nFailed to symlink '" . $path . "' to '" . $target_path . "': $!\n");
    $misses++;
    next;
  }
  $misses = 0;
  print ".";
  # 5. Repeat until complete
  $current_docs++;
  if ($current_docs % 10000 == 0)
  {
    print '[' . $current_docs . "]\n";
  }
}
print '[' . $current_docs . "]\n";
if ($current_docs < $max_docs)
{
  warn("Gave up after " . $misses . " consecutive picks without a new document; only " . $current_docs . " of " . $max_docs . " documents linked.\n");
  exit(1);
}
print "Complete!\n";
exit;

# Pick a random document (any non-directory entry) from beneath a directory.
#
# One entry of $dir is chosen at random, ignoring names that start with a
# fullstop. A directory is descended into recursively; anything else is
# returned. If the chosen directory turns out to contain no documents, another
# entry of $dir is tried instead.
#
# @param  $dir  the directory to pick from
# @return the path of the chosen document, built as "$dir/<entry>" at each
#         level, or nothing (undef in scalar context) when no document exists
#         anywhere beneath $dir
# Dies if a directory cannot be opened for reading.
sub pickRandomDoc
{
  my ($dir) = @_;

  # A lexical handle is used because this sub recurses; a bareword handle
  # would be shared between all levels of the recursion.
  opendir(my $dh, $dir)
    or die ("Failed to open import directory for reading!\n");
  # get the files in this dir, but skip anything starting with a fullstop
  my @files = grep {!/^\./} readdir($dh);
  closedir($dh);

  # Try the entries in random order, without replacement, until one yields a
  # document. An empty directory leaves the loop immediately.
  while (@files)
  {
    my ($file) = splice(@files, int(rand(scalar(@files))), 1);
    # found a directory or a file
    my $path = $dir . '/' . $file;
    # return the file
    if (!-d $path)
    {
      return $path;
    }
    # descend into directories
    my $found = pickRandomDoc($path);
    if (defined $found)
    {
      return $found;
    }
  }
  # nothing beneath this directory
  return;
}

# Create the directories needed to hold a file beneath the subset directory.
#
# The part of $full_path between "<subset_dir>/" and the final file name is
# split on '/' and each missing directory is created in turn, starting from
# $subset_dir (which is not itself created here). Any file name is accepted.
#
# @param  $subset_dir  the root of the subset
# @param  $full_path   path of a file beneath $subset_dir, e.g.
#                      "./<subset_dir>/a/b/doc.txt"
# Nothing is created when $full_path has no directories between the subset
# directory and the file name, or does not contain $subset_dir at all. A
# failed mkdir is reported with a warning and the remaining levels are
# skipped.
sub recursiveMkdir
{
  my ($subset_dir, $full_path) = @_;
  my $test_path = $subset_dir;
  # tolerate a trailing slash on the subset directory when matching
  (my $root = $subset_dir) =~ s{/+\z}{};
  # extract just the juicy part of the path: the directories between the
  # subset directory and the file name
  if ($full_path =~ m{(?:\A|/)\Q$root\E/+(.+)/[^/]+\z}s)
  {
    my $dirs = $1;
    # doubled slashes produce empty parts, which are skipped
    my @dir_parts = grep { length($_) } split(/\//, $dirs);
    foreach my $dir (@dir_parts)
    {
      $test_path .= '/' . $dir;
      if (-d $test_path)
      {
        next;
      }
      # re-test with -d in case the directory appeared between the two calls
      if (!mkdir($test_path, 0755) && !-d $test_path)
      {
        warn("Failed to create directory '" . $test_path . "': $!\n");
        return;
      }
    }
  }
  return;
}

1;
