/**
 * Class that uses OpenNLP with the Language Detection Model to determine, with a default
 * or configurable level of confidence, whether text (from a file or stdin) is in Maori or not.
 * Internal functions can be used for detecting any of the 103 languages currently supported by
 * the OpenNLP Language Detection Model.
 * 
 * http://opennlp.apache.org/news/model-langdetect-183.html
 * language detector model: http://opennlp.apache.org/models.html
 *        Pre-trained models for OpenNLP 1.5: http://opennlp.sourceforge.net/models-1.5/
 * Use of Apache OpenNLP in general:
 *   http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
 * Use of OpenNLP for language detection:
 * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
 * 
 * This code was based on the information and sample code at the above links and the links dispersed throughout this file.
 * See also the accompanying README file.
 *
 * July 2019
 */

package org.hathitrust.lrl.core;

import java.io.*;
import opennlp.tools.langdetect.*;
import opennlp.tools.util.*;

/**
 * Export the LRL_HOME environment variable (short for Low-Resource Language)
 * to be where you have put your Apache OpenNLP 'bin' model(s).
 * Alternatively, set the Java system property "lrl.home", which is checked before LRL_HOME.
 * 
 * Create the folder "etc/opennlp-models" within the $LRL_HOME folder, and put the file "langdetect-183.bin" in there
 *    (which is the language detection model zipped up and renamed to .bin extension).
 *
 * Then, to compile this program, do the following from the "src" folder (the folder containing this java file):
 *    maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/MaoriTextDetector.java
 *
 * To run this program, issue one of the following commands from the "src" folder (the folder containing this java file):
 *
 *    maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --help
 *
 *    maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --file &lt;full/path/to/textfile&gt;
 *
 *    maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector -
 *       Press enter. This variant of the program expects text to stream in from standard input.
 *       If entering text manually, then remember to press Ctrl-D to indicate the usual end of StdIn.
 *
 * https://stackoverflow.com/questions/219585/including-all-the-jars-in-a-directory-within-the-java-classpath
 * Also has information on how to run this class if it's in a Java package.
 */
public class LanguageTextDetector 
{
	/**
	 * Default cut-off confidence. Not referenced inside this class; it is available to
	 * subclasses and same-package callers to pass to the constructor.
	 */
	protected static final double DEFAULT_MINIMUM_CONFIDENCE = 0.50;
	
	/**
	 * Configurable cut-off: the best predicted language is only accepted when its confidence
	 * is greater than or equal to this value.
	 */
	public final double MINIMUM_CONFIDENCE;
	
	/**
	 * When true, doPrint() and doPrintErr() print nothing (run silently).
	 * When false, informational messages are written to stdout/stderr while running.
	 */
	public final boolean silentMode;

	/**
	 * Location of the OpenNLP Language Detection Model file relative to the LRL home folder,
	 * i.e. the model is expected at $LRL_HOME/etc/opennlp-models/langdetect-183.bin
	 */
	private static final String LANG_DETECT_MODEL_RELATIVE_PATH = "etc" + File.separator + "opennlp-models" + File.separator + "langdetect-183.bin";

	/** Number of lines isLargeTextInLanguage() accumulates before running one prediction (arbitrary). */
	private static final int LARGE_TEXT_CHUNK_LINES = 100;

	/**
	 * The LanguageDetector object that will do the actual language detection/prediction for us.
	 * Created once in the constructor, can be used as often as needed thereafter.
	 */
	protected LanguageDetector myCategorizer = null;

	/**
	 * Locates the language detection model file and loads it into the detector.
	 * The LRL home folder is taken from the Java system property "lrl.home" if set,
	 * otherwise from the environment variable LRL_HOME.
	 *
	 * @param silentMode true to suppress the messages printed by doPrint()/doPrintErr()
	 * @param min_confidence minimum confidence (inclusive) for a prediction to be accepted
	 * @throws Exception if neither lrl.home nor LRL_HOME is set, if the model file does not exist,
	 *         or if reading/parsing the model file fails. Deliberately not caught here: the
	 *         instantiating code should handle critical exceptions, constructors shouldn't.
	 */
	public LanguageTextDetector(boolean silentMode, double min_confidence) throws Exception {
	
		this.silentMode = silentMode;
		this.MINIMUM_CONFIDENCE = min_confidence;
		
		// 1. Check we can find the Language Detect Model file in the expected location.
		//    The system property takes precedence over the environment variable.
		String lrlHome = System.getProperty("lrl.home");
		if (lrlHome == null) {
			lrlHome = System.getenv("LRL_HOME");
		}
		if (lrlHome == null) {
			throw new Exception("\n\t*** Environment variable LRL_HOME must be set to your Apache OpenNLP installation folder.");
		}

		File langDetectModelBinFile = new File(lrlHome + File.separator + LANG_DETECT_MODEL_RELATIVE_PATH);
		if (!langDetectModelBinFile.exists()) {
			// Report the path that is actually checked, so the hint matches what the code looks for.
			throw new Exception("\n\t*** " + langDetectModelBinFile.getPath() + " doesn't exist."
					+ "\n\t*** Ensure the $LRL_HOME folder contains the model file '" + LANG_DETECT_MODEL_RELATIVE_PATH + "'.");
		}

		// 2. Set up our language detector Model and the Categorizer for language predictions based on the Model.
		// http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
		// http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
		// try-with-resources closes the model stream whether or not loading succeeds.
		try (InputStream modelIn = new FileInputStream(langDetectModelBinFile)) {
			LanguageDetectorModel model = new LanguageDetectorModel(modelIn);
			this.myCategorizer = new LanguageDetectorME(model);
		}
	}

	
	/**
	 * @param text the text to analyse
	 * @return the single most probable language for the text, as predicted by the detector
	 */
	public Language predictLanguage(String text) {
		return myCategorizer.predictLanguage(text);
	}
	
	
	/**
	 * Predicts the most probable language of the text and checks it against the requested language.
	 * The predicted language and its confidence are printed unless running in silent mode.
	 *
	 * @param langCode is 3 letter language code, ISO 639-2/3 
	 * https://www.loc.gov/standards/iso639-2/php/code_list.php
	 * https://en.wikipedia.org/wiki/ISO_639-3
	 * @param text the text to analyse
	 * @return true if the best predicted language for the text equals langCode and its confidence
	 * is greater than or equal to MINIMUM_CONFIDENCE.
	 */
	public boolean isTextInLanguage(String langCode, String text) {
		// Get the most probable language
		Language bestLanguage = myCategorizer.predictLanguage(text);
		doPrint("Best language: " + bestLanguage.getLang());
		doPrint("Best language confidence: " + bestLanguage.getConfidence());

		return bestLanguage.getLang().equals(langCode)
				&& bestLanguage.getConfidence() >= this.MINIMUM_CONFIDENCE;
	}

	/**
	 * Handle "smaller" textfiles/streams of text read in: the whole stream is read into memory
	 * and analysed in one go.
	 * Return value is the same as for isTextInLanguage(String langCode, String text);
	 *
	 * @param langCode 3 letter language code to test for
	 * @param reader source of the text; read until end of stream, not closed by this method
	 * @throws Exception if reading from the reader fails
	 */
	public boolean isTextInLanguage(String langCode, BufferedReader reader) throws Exception {
		// https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file

		StringBuilder text = new StringBuilder();
		String line;

		while ((line = reader.readLine()) != null) { // readLine removes newline separator
			text.append(line).append('\n'); // add back (unix style) line ending
		}
		return isTextInLanguage(langCode, text.toString());
	}


	/**
	 * Rudimentary attempt to deal with very large files: the text is analysed in chunks of
	 * LARGE_TEXT_CHUNK_LINES lines, plus one final shorter chunk for any leftover lines.
	 * A warning is printed whenever consecutive chunks are predicted to be in different languages.
	 *
	 * The result is based on the language predicted for the LAST chunk and on the mean of the
	 * per-chunk confidences. Every chunk has equal weight in that mean, even though the final
	 * chunk may contain fewer lines than the others.
	 *
	 * @param langCode 3 letter language code to test for
	 * @param reader source of the text; read until end of stream, not closed by this method
	 * @return true if the last chunk's predicted language equals langCode and the mean chunk
	 * confidence is greater than or equal to MINIMUM_CONFIDENCE; false if the reader yields no lines.
	 * @throws Exception if reading from the reader fails
	 */    
	public boolean isLargeTextInLanguage(String langCode, BufferedReader reader) throws Exception {

		StringBuilder text = new StringBuilder();
		String line;

		String language = null;          // language predicted for the previous chunk, if any
		double cumulativeConfidence = 0;
		int numChunks = 0;               // number of predictions that contributed to cumulativeConfidence
		int linesInChunk = 0;

		while ((line = reader.readLine()) != null) { // readLine removes newline separator
			text.append(line).append('\n'); // add back (unix style) line ending
			linesInChunk++;

			if (linesInChunk == LARGE_TEXT_CHUNK_LINES) {
				Language bestLanguage = predictChunkLanguage(text.toString(), language, "last", linesInChunk);
				language = bestLanguage.getLang();
				cumulativeConfidence += bestLanguage.getConfidence();
				numChunks++;

				// finished analysing this chunk, start collecting the next one
				text = new StringBuilder();
				linesInChunk = 0;
			}
		}

		// Process any remaining text that was less than a full chunk.
		// (line is null at this point, so there is nothing further to append.)
		if (text.length() > 0) {
			Language bestLanguage = predictChunkLanguage(text.toString(), language, "final", linesInChunk);
			language = bestLanguage.getLang();
			cumulativeConfidence += bestLanguage.getConfidence();
			numChunks++;
		}

		if (numChunks == 0) {
			// Nothing was read, so no language was predicted at all.
			return false;
		}

		// Divide by the number of predictions actually made, so that input whose length is an
		// exact multiple of the chunk size is not penalised by a phantom extra chunk.
		double avgConfidence = cumulativeConfidence / numChunks;

		return language.equals(langCode) && avgConfidence >= this.MINIMUM_CONFIDENCE;
	}

	/**
	 * Predicts the language of one chunk of text for isLargeTextInLanguage(), warning when the
	 * prediction differs from the previous chunk's and reporting the prediction on stderr.
	 *
	 * @param chunk the text of the chunk
	 * @param previousLanguage language predicted for the preceding chunk, or null for the first chunk
	 * @param position word used in the report to describe the chunk ("last" or "final")
	 * @param numLines number of lines the chunk contains, used in the report
	 * @return the best predicted language for the chunk
	 */
	private Language predictChunkLanguage(String chunk, String previousLanguage, String position, int numLines) {
		Language bestLanguage = myCategorizer.predictLanguage(chunk);

		if (previousLanguage != null && !bestLanguage.getLang().equals(previousLanguage)) {
			doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
		}
		doPrintErr("Best predicted language for " + position + " " + numLines + " lines: "
				+ bestLanguage.getLang() + "(confidence: " + bestLanguage.getConfidence() + ")");

		return bestLanguage;
	}


	/**
	 * Prints to STDOUT the predicted languages of the input text, in the order returned by the detector.
	 * UNUSED.
	 *
	 * @param text the text to analyse
	 */
	public void predictedLanguages(String text) {
		// Get an array with the most probable languages
		Language[] languages = myCategorizer.predictLanguages(text);

		if (languages == null || languages.length <= 0) {
			doPrintErr("No languages predicted for the input text");
			return;
		}

		for (int i = 0; i < languages.length; i++) {
			doPrint("Language prediction " + i + ": " + languages[i]);
		}
	}

	/** Prints msg to STDOUT unless this detector is in silent mode. */
	public void doPrint(String msg) {
		doPrint(this.silentMode, msg);
	}

	/** Prints msg to STDERR unless this detector is in silent mode. */
	public void doPrintErr(String msg) {
		doPrintErr(this.silentMode, msg);
	}

	/********** STATIC METHODS *************/

	/** Prints msg to STDOUT unless runSilent is true. */
	public static void doPrint(boolean runSilent, String msg) {
		if (!runSilent) {
			System.out.println(msg);
		}
	}

	/** Prints msg to STDERR unless runSilent is true. */
	public static void doPrintErr(boolean runSilent, String msg) {
		if (!runSilent) {
			System.err.println(msg);
		}
	}


}
