package org.terrier.applications;

import java.io.BufferedReader;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.HashSet;
import java.util.Iterator;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import org.terrier.indexing.BlockIndexer;
import org.terrier.indexing.BlockSinglePassIndexer;
import org.terrier.indexing.Collection;
import org.terrier.indexing.Indexer;
import org.terrier.indexing.SimpleFileCollection;
import org.terrier.structures.CollectionStatistics;
import org.terrier.structures.Index;
import org.terrier.structures.merging.StructureMerger;
import org.terrier.utility.ApplicationSetup;

/** @class FileIndexer
 *
 *  A simple indexer, based on DesktopTerrier, that is intended to build a
 *  searchable index from the contents of a directory in the file system. This
 *  indexer has been created with parallel processing in mind, in that you can
 *  split a large or computationally complex directory into several batches
 *  (the contents of which are recorded by manifest files) and then process
 *  each batch independently (and in parallel on a multicore machine or a
 *  cluster). Once this is complete you can then merge each pair of indexes
 *  together - eventually generating a final index encompassing all of the
 *  batches.
 *
 *  Start by preparing the collection for indexing by locating indexable files
 *  and recording them in one or more manifest files, i.e.:
 *
 *  ./bin/anyclass.sh org.terrier.applications.FileIndexer -prepare \
 *                    -path /data/mycollection/ -batchsize 10
 *
 *  You then build the uniquely named index (in this case with a prefix of
 *  '000') for each manifest, i.e.:
 *
 *  ./bin/anyclass.sh org.terrier.applications.FileIndexer -index \
 *                    -path /terrier/var/manifest-000.spec -prefix 000
 *
 *  If there is more than one index (for example, with prefixes '000' and
 *  '001'), you can then merge them (into a single index with the prefix
 *  'data') using this command:
 *
 *  ./bin/anyclass.sh \
 *         org.terrier.structures.merging.BlockStructureMerger \
 *         /terrier/var/index/ 000 /terrier/var/index/ 001 \
 *         /terrier/var/index/ data
 *
 */
public class FileIndexer
{
  /** Prefix naming the index this instance builds; passed to the Terrier
   *  indexer (and used to clear stale files) in runIndex(). */
  private String index_prefix;

  /** File extensions (without the leading dot) that have a parser mapping
   *  configured; populated by the constructor and consulted by canIndex(). */
  private HashSet<String> supported_extensions;

  /** Paths of the files to be indexed; filled by loadManifest() and handed
   *  to the SimpleFileCollection in runIndex(). */
  private List<String> file_list;

  /** Logger shared by all FileIndexer instances. */
  protected static final Logger logger = Logger.getLogger(FileIndexer.class);

  /** @function FileIndexer
   *  Creates an indexer that builds its index under the given prefix. The
   *  constructor also applies this application's Terrier property settings
   *  and records which file extensions have a parser configured in the
   *  "indexing.simplefilecollection.extensionsparsers" property.
   *
   *  @param index_prefix the prefix naming the index this instance builds
   */
  public FileIndexer(String index_prefix)
  {
    this.index_prefix = index_prefix;
    this.supported_extensions = new HashSet<String>();
    this.file_list = new ArrayList<String>();

    // only supply the metadata defaults when the user has configured nothing
    final String configured_meta_keys = ApplicationSetup.getProperty("indexer.meta.forward.keys", null);
    if (configured_meta_keys == null)
    {
      ApplicationSetup.setProperty("indexer.meta.forward.keys","docno,filename");
      ApplicationSetup.setProperty("indexer.meta.forward.keylens","26,2048");
    }
    // these settings are always forced, regardless of existing configuration
    ApplicationSetup.setProperty("indexing.max.tokens", "10000");
    ApplicationSetup.setProperty("invertedfile.processterms","25000");
    ApplicationSetup.setProperty("ignore.low.idf.terms","false");
    ApplicationSetup.setProperty("matching.dsms", "BooleanFallback");

    // the property is a comma separated list of <ext>:<documentclass> pairs
    final String parser_spec = ApplicationSetup.getProperty("indexing.simplefilecollection.extensionsparsers","txt:FileDocument");
    if (parser_spec.length() == 0)
    {
      logger.error("No extension parsers defined in Terrier's properties - SimpleFileCollection unusable");
      return;
    }
    for (String parser_entry : parser_spec.split("\\s*,\\s*"))
    {
      final String[] parts = parser_entry.split(":");
      if (parts.length != 2)
      {
        logger.warn("Malformed extension parser mapping: " + parser_entry);
        continue;
      }
      // only the extension is remembered here - resolving the document
      // class is left to SimpleFileCollection
      supported_extensions.add(parts[0]);
    }
  }
  /** FileIndexer(String) **/

  /** @function canIndex
   *  Given a path to a file, determine if Terrier's current configuration
   *  allows that file to be indexed.
   *
   *  A file is indexable when a default parser is configured (property
   *  "indexing.simplefilecollection.defaultparser" is non-empty), or when
   *  the text after the last '.' in the path is one of the extensions
   *  recorded by the constructor. A path containing no '.' has no extension
   *  and is therefore only indexable via the default parser.
   *
   *  @param file_path the path (or bare name) of the candidate file
   *  @return true if the file would be indexed, false otherwise
   */
  public boolean canIndex(String file_path)
  {
    // we may have specified a default Document class for all files encountered
    if (!ApplicationSetup.getProperty("indexing.simplefilecollection.defaultparser","").equals(""))
    {
      return true;
    }
    // otherwise the file's extension must be one of the supported ones
    boolean result = false;
    int dot_index = file_path.lastIndexOf('.');
    // without this guard a file literally named "txt" would be treated as
    // having the extension "txt" (substring(0) returns the whole name)
    if (dot_index >= 0)
    {
      String file_extension = file_path.substring(dot_index + 1);
      result = this.supported_extensions.contains(file_extension);
    }
    logger.info("[P1] Can index \"" + file_path + "\"? => " + result);
    return result;
  }
  /** canIndex(String) **/

  /** @function close
   *  Releases the file list and the set of supported extensions held by this
   *  indexer. The instance must not be used for indexing afterwards. Calling
   *  close() more than once is harmless.
   */
  public void close()
  {
    // guard against repeated calls - the fields are null after the first
    if (this.file_list != null)
    {
      this.file_list.clear();
      this.file_list = null;
    }
    if (this.supported_extensions != null)
    {
      this.supported_extensions.clear();
      this.supported_extensions = null;
    }
  }
  /** close() **/

  /** @function deleteIndex
   *  Removes every file in Terrier's index directory that belongs to the
   *  index with the given prefix, i.e. files named exactly "<prefix>" or
   *  "<prefix>.<anything>". Matching on the trailing '.' (as renameIndex()
   *  does) stops a request to delete index "1" from also removing the files
   *  of indexes "10", "11", and so on.
   *
   *  @param prefix the prefix of the index to delete
   *  @param logging_name a short tag included in log messages
   */
  public void deleteIndex(String prefix, String logging_name)
  {
    logger.info("[" + logging_name + "] Delete index " + prefix);
    File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
    File[] files = index_path.listFiles();
    // listFiles() returns null when the directory is missing or unreadable
    if (files == null)
    {
      logger.warn("[" + logging_name + "] Cannot list index directory " + index_path + " - nothing deleted");
      return;
    }
    String prefix_with_dot = prefix + ".";
    for (File a_file : files)
    {
      String file_name = a_file.getName();
      if (file_name.equals(prefix) || file_name.startsWith(prefix_with_dot))
      {
        if (!a_file.delete())
        {
          logger.warn("[" + logging_name + "] Failed to delete " + a_file);
        }
      }
    }
  }
  /** deleteIndex(String, String) **/

  /** @function epochTime
   *  Reports the current system time as the number of whole seconds elapsed
   *  since 1970JAN01.
   *
   *  @return seconds since the epoch
   */
  public long epochTime()
  {
    final long now_in_millis = System.currentTimeMillis();
    return now_in_millis / 1000L;
  }
  /** epochTime() **/

  /** @function listIndexes
   *  Lists the distinct index prefixes found in Terrier's index directory. A
   *  prefix is the part of a file name before its first '.'. Files without
   *  such a prefix (no '.' at all, or a leading '.' as in hidden files) are
   *  not index files and are ignored.
   *
   *  @param include_default when false, files whose names start with the
   *         default index prefix (property "terrier.index.prefix", "data"
   *         if unset) are left out
   *  @return a queue holding each prefix once; empty when nothing was found
   */
  public ArrayDeque<String> listIndexes(boolean include_default)
  {
    File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
    String default_index_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data");
    // a set ensures each prefix only occurs once
    HashSet<String> index_parts_set = new HashSet<String>();
    File[] files = index_path.listFiles();
    if (files != null)
    {
      for (File a_file : files)
      {
        String file_name = a_file.getName();
        int dot_index = file_name.indexOf('.');
        // substring(0, -1) would throw, and an empty prefix is meaningless
        if (dot_index <= 0)
        {
          continue;
        }
        if (include_default || !file_name.startsWith(default_index_prefix))
        {
          index_parts_set.add(file_name.substring(0, dot_index));
        }
      }
    }
    else
    {
      logger.error("[P3] Error! No indexes found - did indexing fail?");
    }
    // hand the prefixes back as a queue, ready for pairwise merging
    return new ArrayDeque<String>(index_parts_set);
  }
  /** listIndexes(boolean) **/

  /** @function loadManifest
   *  Reads a manifest file and appends every non-blank line in it (one file
   *  path per line) to the list of files to be indexed. The manifest is
   *  decoded with the platform default charset. A read failure is logged
   *  and leaves whatever was read so far in the list.
   *
   *  @param index_prefix the index prefix, used only to tag the log message
   *  @param manifest_path the manifest file to read
   */
  public void loadManifest(String index_prefix, Path manifest_path)
  {
    logger.info("[B" + index_prefix + "] Load manifest " + manifest_path.toString());
    // try-with-resources closes the reader even when readLine() fails
    try (BufferedReader manifest_reader = new BufferedReader(new InputStreamReader(new FileInputStream(manifest_path.toFile()))))
    {
      String line;
      while ((line = manifest_reader.readLine()) != null)
      {
        // a blank line is not a file path - skip it rather than index ""
        if (line.trim().length() == 0)
        {
          continue;
        }
        this.file_list.add(line);
      }
    }
    catch (IOException e)
    {
      logger.error("Exception when reading manifest! " + e);
    }
  }
  /** loadManifest(String, Path) **/

  /** @function mergeIndexes
   *  Merges two indexes into the index named by the default prefix (the
   *  "terrier.index.prefix" property, or "data" when that is unset).
   *
   *  @param prefix_one prefix of the first index to merge
   *  @param prefix_two prefix of the second index to merge
   *  @return the result of the three-argument mergeIndexes()
   */
  public boolean mergeIndexes(String prefix_one, String prefix_two)
  {
    return this.mergeIndexes(prefix_one, prefix_two, ApplicationSetup.getProperty("terrier.index.prefix", "data"));
  }
  /** mergeIndexes(String, String) **/

  /** @function mergeIndexes
   *  Merges the two indexes named by prefix_one and prefix_two into a new
   *  index named prefix_out, all within Terrier's index directory. Logging
   *  is silenced while the merge itself runs. When the merge succeeds and
   *  all indexes close cleanly, the two input indexes are deleted.
   *
   *  @param prefix_one prefix of the first index to merge
   *  @param prefix_two prefix of the second index to merge
   *  @param prefix_out prefix of the index to create
   *  @return true on success; false if an index could not be opened or
   *          closed, in which case the input indexes are left in place
   */
  public boolean mergeIndexes(String prefix_one, String prefix_two, String prefix_out)
  {
    logger.info("[P3] Merge indexes " + prefix_one + " and " + prefix_two + " => " + prefix_out);
    String index_path = ApplicationSetup.TERRIER_INDEX_PATH;
    Index.setIndexLoadingProfileAsRetrieval(false);

    Index index_one = null;
    Index index_two = null;
    Index index_out = null;
    boolean closed_cleanly = true;
    try
    {
      index_one = Index.createIndex(index_path, prefix_one);
      index_two = Index.createIndex(index_path, prefix_two);
      // runIndex() treats a null return from createIndex() as possible, so
      // do the same here instead of failing later inside the merger
      if (index_one == null || index_two == null)
      {
        logger.error("[P3] Unable to open index " + (index_one == null ? prefix_one : prefix_two) + " - merge aborted");
        return false;
      }
      index_out = Index.createNewIndex(index_path, prefix_out);
      if (index_out == null)
      {
        logger.error("[P3] Unable to create index " + prefix_out + " - merge aborted");
        return false;
      }
      StructureMerger structure_merger = new StructureMerger(index_one, index_two, index_out);

      // quiet the loggers for the duration of the merge, making sure the
      // previous level comes back even if the merge throws
      Logger root_logger = Logger.getRootLogger();
      Level log_level = root_logger.getLevel();
      root_logger.setLevel((Level) Level.OFF);
      try
      {
        structure_merger.mergeStructures();
      }
      finally
      {
        root_logger.setLevel(log_level);
      }

      // Only print out statistics for 'data'
      if (prefix_out.equals("data"))
      {
        CollectionStatistics collection_statistics = index_out.getCollectionStatistics();
        logger.info("[P3] Number of Documents: " + collection_statistics.getNumberOfDocuments());
        logger.info("[P3] Number of Tokens: " + collection_statistics.getNumberOfTokens());
        logger.info("[P3] Number of Unique Terms: " + collection_statistics.getNumberOfUniqueTerms());
        logger.info("[P3] Number of Pointers: " + collection_statistics.getNumberOfPointers());
      }
    }
    finally
    {
      // close every index that was opened, even when one close fails
      for (Index an_index : new Index[] { index_one, index_two, index_out })
      {
        if (an_index == null)
        {
          continue;
        }
        try
        {
          an_index.close();
        }
        catch (IOException e)
        {
          logger.error("Exception while closing indexes: ", e);
          closed_cleanly = false;
        }
      }
    }
    if (!closed_cleanly)
    {
      return false;
    }
    // the inputs are now redundant
    this.deleteIndex(prefix_one, "P3");
    this.deleteIndex(prefix_two, "P3");
    return true;
  }
  /** mergeIndexes(String, String, String) **/

  /** @function renameIndex
   *  Renames the given index to the default prefix (the
   *  "terrier.index.prefix" property, or "data" when that is unset).
   *
   *  @param prefix_in prefix of the index to rename
   */
  public void renameIndex(String prefix_in)
  {
    this.renameIndex(prefix_in, ApplicationSetup.getProperty("terrier.index.prefix", "data"));
  }
  /** renameIndex(String) **/

  /** @function renameIndex
   *  Renames every file "<prefix_in>.<rest>" in Terrier's index directory to
   *  "<prefix_out>.<rest>". A file that cannot be moved (for example because
   *  the target already exists) is reported and skipped; the remaining files
   *  are still processed.
   *
   *  @param prefix_in prefix of the index to rename
   *  @param prefix_out the new prefix
   */
  public void renameIndex(String prefix_in, String prefix_out)
  {
    logger.info("[P3] Rename index " + prefix_in + " => " + prefix_out);
    File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
    File[] files = index_path.listFiles();
    // listFiles() returns null when the directory is missing or unreadable
    if (files == null)
    {
      logger.error("[P3] Error! Cannot list index directory " + index_path + " - nothing renamed");
      return;
    }
    // match on the trailing '.' so index "1" does not pick up index "10"
    String file_prefix = prefix_in + ".";
    for (File a_file : files)
    {
      String file_name = a_file.getName();
      if (!file_name.startsWith(file_prefix))
      {
        continue;
      }
      // keep everything after the prefix (starting at its '.'); cutting at
      // the prefix length stays correct even if the prefix contains a '.'
      String suffix = file_name.substring(prefix_in.length());
      Path target_path = Paths.get(ApplicationSetup.TERRIER_INDEX_PATH, prefix_out + suffix);
      try
      {
        Files.move(a_file.toPath(), target_path);
      }
      catch (IOException e)
      {
        logger.error("Error! Failed to rename file: " + e, e);
      }
    }
  }
  /** renameIndex(String, String) **/

  /** @function runIndex
   *  Builds the index named by this instance's prefix from the files loaded
   *  via loadManifest(). Any existing index files with the same prefix are
   *  deleted first. A block indexer is always used; the property
   *  "desktop.indexing.singlepass" selects the single-pass variant. Once
   *  indexing finishes, the collection statistics are logged. Failures are
   *  logged rather than thrown.
   *
   *  @param batch_number a label for this batch, used only in log messages
   */
  public void runIndex(String batch_number)
  {
    logger.info("[B" + batch_number + "] create index");
    if (this.file_list == null || this.file_list.size() == 0)
    {
      logger.error("No files specified to index. Aborting indexing process.");
      return;
    }

    try
    {
      // ensure the index directory exists
      File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
      if (!index_path.exists() && !index_path.mkdirs())
      {
        logger.error("Could not create the index folders at: "+ index_path);
        logger.error("Aborting indexing process");
        return;
      }
      this.deleteIndex(this.index_prefix, "F");
      // create the appropriate indexer
      Indexer indexer;
      final boolean use_single_pass = Boolean.parseBoolean(ApplicationSetup.getProperty("desktop.indexing.singlepass", "false"));
      logger.warn("BLOCK_INDEXING hardcoded default for SimpleFileCollection");
      if (use_single_pass)
      {
        indexer = new BlockSinglePassIndexer(ApplicationSetup.TERRIER_INDEX_PATH, this.index_prefix);
      }
      else
      {
        indexer = new BlockIndexer(ApplicationSetup.TERRIER_INDEX_PATH, this.index_prefix);
      }
      // create the simple file collection object and hand it to the indexer
      // for indexing
      SimpleFileCollection sfc = new SimpleFileCollection(this.file_list, false);
      indexer.index(new Collection[] { sfc });
      // reopen the finished index purely to report its statistics
      Index the_index = Index.createIndex(ApplicationSetup.TERRIER_INDEX_PATH, this.index_prefix);
      if (the_index != null)
      {
        try
        {
          CollectionStatistics collection_statistics = the_index.getCollectionStatistics();
          logger.info("Number of Documents: " + collection_statistics.getNumberOfDocuments());
          logger.info("Number of Tokens: " + collection_statistics.getNumberOfTokens());
          logger.info("Number of Unique Terms: " + collection_statistics.getNumberOfUniqueTerms());
          logger.info("Number of Pointers: " + collection_statistics.getNumberOfPointers());
        }
        finally
        {
          // the index was opened only for reporting - do not leave it open
          try
          {
            the_index.close();
          }
          catch (IOException e)
          {
            logger.warn("Exception while closing index " + this.index_prefix, e);
          }
        }
      }
      else
      {
        logger.warn("Nothing indexed!");
      }
    }
    catch(Exception e)
    {
      logger.error("An unexpected exception occured while indexing. Indexing has been aborted.",e);
    }
    logger.info("FileIndexer::runIndex() - Complete!");
  }
  /** runIndex(String) **/

  /** @function main
   *  Command line entry point. The first argument selects the mode:
   *  "-prepare" (write manifests listing the indexable files below a
   *  directory), "-index" (index the files listed in one manifest) or
   *  "-merge" (merge all non-default indexes into the default index). Any
   *  other first argument is treated as "-prepare", with a warning. The
   *  remaining arguments are "-key value" pairs: -path, -prefix, -batchsize
   *  and -maxfiles.
   *
   *  The process exits with status 1 when the arguments are invalid or the
   *  requested operation fails.
   *
   *  @param args the command line arguments
   */
  public static void main(String[] args)
  {
    System.out.println("================================= FileIndexer =================================");

    // 0. Initialization
    final int PREPARE = 0;
    final int INDEX   = 1;
    final int MERGE   = 2;
    int index_mode = PREPARE;
    Path a_path = null;
    String index_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data");
    int batch_size = 0; // No limit
    int max_files = 0; // No limit
    String divider = "===============================================================================";

    // 1. Parse arguments
    // - first argument is the mode (prepare|index|merge)
    if (args.length < 1)
    {
      System.out.println("Usage: <FileIndexer> -prepare -path <path> [-batchsize <int>] [-maxfiles <int>]");
      System.out.println("       <FileIndexer> -index -path <path> [-prefix <str>]");
      System.out.println("       <FileIndexer> -merge");
      System.out.println("where: paths *must* be absolute");
      System.out.println("       when preparing 'path' is to the collection directory");
      System.out.println("       when indexing 'path' is to a manifest file (generated by prepare)");
      System.out.println("       prefix is 'data' by default");
      System.out.println("       batch_size is unlimited by default");
      System.exit(0);
    }
    if (args[0].equals("-index"))
    {
      index_mode = INDEX;
    }
    else if (args[0].equals("-merge"))
    {
      index_mode = MERGE;
    }
    else if (!args[0].equals("-prepare"))
    {
      // prepare has always been the fallback mode; keep that, but say so
      System.err.println("Warning! Unknown mode: " + args[0] + " - assuming -prepare");
    }
    // - all other arguments are "-key value" pairs
    for (int argc = 1; (argc + 1) < args.length; argc += 2)
    {
      String key = args[argc];
      String value = args[argc + 1];
      try
      {
        if (key.equals("-batchsize"))
        {
          batch_size = Integer.parseInt(value);
        }
        else if (key.equals("-maxfiles"))
        {
          max_files = Integer.parseInt(value);
        }
        else if (key.equals("-path"))
        {
          a_path = Paths.get(value);
        }
        else if (key.equals("-prefix"))
        {
          index_prefix = value;
        }
        else
        {
          System.err.println("Warning! Unknown argument: " + key);
        }
      }
      catch (NumberFormatException e)
      {
        System.err.println("Error! Argument " + key + " requires an integer value but was given: " + value);
        System.exit(1);
      }
    }
    if ((args.length - 1) % 2 != 0)
    {
      System.err.println("Warning! Ignoring argument without a value: " + args[args.length - 1]);
    }
    // - check arguments
    if (index_mode != MERGE && (a_path == null || !Files.exists(a_path)))
    {
      System.err.println("Error! Required argument -path not set or invalid.");
      System.err.println("");
      System.exit(1);
    }

    // 2. Create the file indexer
    FileIndexer file_indexer = new FileIndexer(index_prefix);
    boolean succeeded = true;

    switch(index_mode)
    {
    case PREPARE:
      // 3a. Go through the files in the search path, adding those that are
      //     indexable by Terrier's current configuration into one or more
      //     manifest files based upon the value of batch_size.
      logger.info("[P1:" + file_indexer.epochTime() + "] Starting manifest preparation");
      System.out.println("Mode: Preparation");
      System.out.println("Collection Path: " + a_path.toString());
      if (batch_size != 0)
      {
        System.out.println("Batch Size: " + batch_size);
      }
      if (max_files != 0)
      {
        System.out.println("Max Files: " + max_files);
      }
      System.out.println(divider);
      succeeded = prepareManifests(file_indexer, a_path, batch_size, max_files);
      logger.info("[P1:" + file_indexer.epochTime() + "] Complete");
      break;

    case INDEX:
      // 3b. Load the manifest specified by the search path and index files
      String manifest_path = a_path.toString();
      System.out.println("Mode:     Index");
      System.out.println("Prefix:   " + index_prefix);
      System.out.println("Manifest: " + manifest_path);
      System.out.println("Indexing: " + ApplicationSetup.getProperty("indexer.meta.forward.keys", "docno,filename"));
      System.out.println(divider);

      logger.info("[B" + index_prefix + ":" + file_indexer.epochTime() + "] Starting");
      file_indexer.loadManifest(index_prefix, a_path);
      file_indexer.runIndex(index_prefix);
      logger.info("[B" + index_prefix + ":" + file_indexer.epochTime() + "] Complete");
      break;

    case MERGE:
      // 3c. Merge every index part found in the index path
      logger.info("[P3:" + file_indexer.epochTime() + "] Starting index merging");
      System.out.println("Mode: Merge");
      System.out.println(divider);
      succeeded = mergeAllIndexes(file_indexer);
      logger.info("[P3:" + file_indexer.epochTime() + "] Complete");
      break;

    default:
      logger.warn("Unknown FileIndexer mode requested");
    }

    // 4. Finish indexing
    file_indexer.close();
    file_indexer = null;

    // 5. Complete!
    System.out.println(divider);
    System.out.println(succeeded ? "Complete!" : "Failed!");
    System.out.println(divider);

    System.out.println("");
    if (!succeeded)
    {
      System.exit(1);
    }
  }

  /** @function prepareManifests
   *  Performs a breadth-first search below collection_path and writes the
   *  path of every file accepted by canIndex() into manifest files named
   *  "manifest-NNN.spec" in Terrier's var directory. The files of each
   *  directory are visited in random order.
   *
   *  @param file_indexer the indexer whose canIndex() decides what is listed
   *  @param collection_path the directory to search
   *  @param batch_size files per manifest; 0 or less puts them all in one
   *  @param max_files stop after this many files in total; 0 or less means
   *         no limit
   *  @return false if the collection path is not a directory or a manifest
   *          could not be opened for writing; true otherwise
   */
  private static boolean prepareManifests(FileIndexer file_indexer, Path collection_path, int batch_size, int max_files)
  {
    if (!Files.isDirectory(collection_path))
    {
      System.err.println("Error! Collection path is not a directory: " + collection_path);
      return false;
    }
    ArrayDeque<Path> search_paths = new ArrayDeque<Path>();
    search_paths.add(collection_path);
    int file_count = 0;       // files written to the current manifest
    int total_file_count = 0; // used in conjunction with max_files
    int manifest_count = 0;
    boolean have_max_files = false;
    PrintWriter manifest_writer = null;
    try
    {
      while (!search_paths.isEmpty() && !have_max_files)
      {
        Path search_path = search_paths.remove();
        File[] files_raw = search_path.toFile().listFiles();
        // listFiles() returns null when the directory cannot be read
        if (files_raw == null)
        {
          System.err.println("Warning! Unable to list directory, skipping: " + search_path);
          continue;
        }
        // randomize file order
        List<File> files = Arrays.asList(files_raw);
        Collections.shuffle(files);
        // iterate through files filling manifests
        for (int i = 0; i < files.size() && !have_max_files; i++)
        {
          Path file_path = files.get(i).toPath();
          // directories are queued for a later pass ("." and ".." never
          // appear in the result of listFiles(), so need no special case)
          if (Files.isDirectory(file_path))
          {
            search_paths.add(file_path);
            continue;
          }
          // would we expect to index this file?
          if (!file_indexer.canIndex(file_path.getFileName().toString()))
          {
            continue;
          }
          // ensure we have a manifest open for writing
          if (manifest_writer == null)
          {
            Path manifest_path = Paths.get(ApplicationSetup.TERRIER_VAR, "manifest-" + String.format("%03d", manifest_count) + ".spec");
            try
            {
              manifest_writer = new PrintWriter(new BufferedOutputStream(new FileOutputStream(manifest_path.toFile())));
            }
            catch (IOException e)
            {
              // without a manifest there is nowhere to record files
              System.err.println("Error when opening manifest for writing! " + e);
              return false;
            }
            System.out.println("* Writing to manifest: " + manifest_path.toString());
          }
          manifest_writer.println(file_path.toString());
          file_count++;
          total_file_count++;
          // if we have a batch size, and we've found enough files, finish
          // this manifest; the next file opens a new one
          if (batch_size > 0 && file_count >= batch_size)
          {
            manifest_writer.close();
            manifest_writer = null;
            manifest_count++;
            file_count = 0;
          }
          // if we have a max files limit and we've done enough files, then
          // break out of for and while loops
          if (max_files > 0 && total_file_count >= max_files)
          {
            have_max_files = true;
          }
        }
      }
    }
    finally
    {
      if (manifest_writer != null)
      {
        manifest_writer.close();
      }
    }
    return true;
  }

  /** @function mergeAllIndexes
   *  Merges every index part reported by listIndexes(false) into the default
   *  index. Parts are merged two at a time into intermediate indexes named
   *  "tempNNN" until two remain, which are merged into the final index. A
   *  single part is simply renamed. Merging stops at the first failure so
   *  that later steps never work from a broken intermediate index.
   *
   *  @param file_indexer the indexer used to list, merge and rename indexes
   *  @return false if any merge reported failure; true otherwise
   */
  private static boolean mergeAllIndexes(FileIndexer file_indexer)
  {
    ArrayDeque<String> index_parts = file_indexer.listIndexes(false);
    int intermediate_file_counter = 0;
    while (index_parts.size() > 2)
    {
      String index_part_one_prefix = index_parts.remove();
      String index_part_two_prefix = index_parts.remove();
      String intermediate_index_prefix = "temp" + String.format("%03d", intermediate_file_counter);
      if (!file_indexer.mergeIndexes(index_part_one_prefix, index_part_two_prefix, intermediate_index_prefix))
      {
        logger.error("[P3] Error! Failed to merge " + index_part_one_prefix + " and " + index_part_two_prefix + " - merging aborted");
        return false;
      }
      // - add the intermediate index to the list of indexes to be merged
      index_parts.add(intermediate_index_prefix);
      // - and increment the intermediate file count for the next iteration
      intermediate_file_counter++;
    }
    // If there are two left we merge them into the final index
    if (index_parts.size() == 2)
    {
      String index_part_one_prefix = index_parts.remove();
      String index_part_two_prefix = index_parts.remove();
      if (!file_indexer.mergeIndexes(index_part_one_prefix, index_part_two_prefix))
      {
        logger.error("[P3] Error! Failed to merge " + index_part_one_prefix + " and " + index_part_two_prefix + " - merging aborted");
        return false;
      }
    }
    // Otherwise we just rename the files that are there into the final index
    else if (index_parts.size() == 1)
    {
      file_indexer.renameIndex(index_parts.remove());
    }
    // no index parts detected - was merge run by mistake?
    else
    {
      logger.warn("[P3] Warning! No index parts detected - merge has no effect");
    }
    return true;
  }
  /** main() **/
}