package org.hathitrust.lrl.vollevel;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.function.BiConsumer;
import java.util.stream.Stream;

import org.apache.commons.lang3.StringUtils;
import org.hathitrust.lrl.core.LanguageTextDetector;
import org.hathitrust.lrl.core.UtilIO;

import opennlp.tools.langdetect.Language;

/**
 * Walks a directory tree of volume text files ({@code *.txt}), asks a
 * {@link LanguageTextDetector} for the best-matching language of each volume
 * (or of each page of each volume), and saves the predictions as CSV files.
 *
 * <p>All state is held in static fields and mutated without synchronization,
 * so the class is intended for single-threaded, run-once use via
 * {@link #main(String[])}.
 */
public class OpenNLPLanguageVolumeClassification {

	/** Language code a prediction must carry to be counted as Maori. */
	protected static String MAORI_3LETTER_CODE = "mri";

	/** Minimum confidence a Maori prediction needs for the "is Maori" column to be true. */
	protected static double minConfidence = 0.9;

	/** When true, only the first {@code DEBUG_MAX_VOLUMES - 1} volumes of a run are classified. */
	protected static boolean DEBUG = false;

	/** Detector shared by the classify methods; (re)created by {@link #perPage} and {@link #perVolume}. */
	protected static LanguageTextDetector maoriTextDetector;

	/** Number of {@code .txt} files seen so far in the current run. */
	protected static int id_count = 0;

	/** Value of {@link #id_count} from which classification is skipped in DEBUG mode. */
	private static final int DEBUG_MAX_VOLUMES = 10;

	/**
	 * Classifies every page of one volume text file and appends one CSV row
	 * per page (volpage-id, vol-id, page-id, language, confidence, is-Maori)
	 * to {@code csv_buffer}. Pages are the pieces obtained by splitting the
	 * file's text on blank lines ({@code "\n\n"}); they are numbered from 1.
	 *
	 * <p>Files whose name does not end in {@code .txt} are ignored. Any
	 * exception raised while reading or classifying is printed and ends
	 * processing of this volume; rows already appended are kept.
	 *
	 * @param input_path path of the volume text file
	 * @param csv_buffer buffer that receives the CSV rows
	 */
	public static void ClassifyPerPageLRL(Path input_path, StringBuffer csv_buffer)
	{
		String input_filename = input_path.toString();
		if (!input_filename.endsWith(".txt")) {
			return;
		}

		id_count++;
		if (debugLimitReached()) {
			return;
		}

		String vol_id = volumeIdFromPath(input_path);

		try {
			System.out.println("Classifying Volume (per page) #" + id_count + ": " + input_filename);

			String full_text = UtilIO.readTextFile(input_filename);
			String[] text_pages = full_text.split("\\n\\n");

			for (int page_index = 0; page_index < text_pages.length; page_index++) {
				// Page sequence numbers are 1-based and zero-padded to six digits
				String page_id = String.format("%06d", page_index + 1);
				String volpage_id = vol_id + "-seq-" + page_id;

				List<String> csv_items = new ArrayList<String>();
				csv_items.add(volpage_id);
				csv_items.add(vol_id);
				csv_items.add(page_id);
				addPredictionColumns(csv_items, text_pages[page_index],
						"Warning: Volume-page id '"+volpage_id+"' contains no text");

				appendCsvLine(csv_buffer, csv_items);
			}
		}
		catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Classifies the full text of one volume text file and appends a single
	 * CSV row (vol-id, language, confidence, is-Maori) to {@code csv_buffer}.
	 *
	 * <p>Files whose name does not end in {@code .txt} are ignored. Any
	 * exception raised while reading or classifying is printed and no row is
	 * appended for the volume.
	 *
	 * @param input_path path of the volume text file
	 * @param csv_buffer buffer that receives the CSV row
	 */
	public static void ClassifyVolumeLRL(Path input_path, StringBuffer csv_buffer)
	{
		String input_filename = input_path.toString();
		if (!input_filename.endsWith(".txt")) {
			return;
		}

		id_count++;
		if (debugLimitReached()) {
			return;
		}

		String vol_id = volumeIdFromPath(input_path);

		try {
			System.out.println("Classifying Volume #" + id_count + ": " + input_filename);

			String full_text = UtilIO.readTextFile(input_filename);

			List<String> csv_items = new ArrayList<String>();
			csv_items.add(vol_id);
			addPredictionColumns(csv_items, full_text,
					"Warning: Volume id '"+vol_id+"' contains no text");

			appendCsvLine(csv_buffer, csv_items);
		}
		catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Classifies every page of every volume text file found under
	 * {@code volume_text_input_base_dir} and saves the result as
	 * {@code maori-volume-per_page-classification-opennlp.csv} in
	 * {@code data_output_base_dir}.
	 *
	 * @param volume_text_input_base_dir root directory that is walked recursively
	 * @param data_output_base_dir directory the CSV file is written to
	 */
	public static void perPage(String volume_text_input_base_dir, String data_output_base_dir)
	{
		String cvs_classification_filename = data_output_base_dir + File.separator + "maori-volume-per_page-classification-opennlp.csv";

		// NOTE(review): "is-moari" looks like a typo of "is-maori"; left unchanged
		// because consumers of the CSV may depend on the existing column name.
		String csv_header_line = String.join(",",
				"volpage-id", "vol-id", "page-id", "predicted-lang", "confidence", "is-moari");

		classifyDirectory(volume_text_input_base_dir, cvs_classification_filename, csv_header_line,
				OpenNLPLanguageVolumeClassification::ClassifyPerPageLRL);
	}

	/**
	 * Classifies every volume text file found under
	 * {@code volume_text_input_base_dir} as a whole and saves the result as
	 * {@code maori-volume-classification-opennlp.csv} in
	 * {@code data_output_base_dir}.
	 *
	 * @param volume_text_input_base_dir root directory that is walked recursively
	 * @param data_output_base_dir directory the CSV file is written to
	 */
	public static void perVolume(String volume_text_input_base_dir, String data_output_base_dir)
	{
		String cvs_classification_filename = data_output_base_dir + File.separator + "maori-volume-classification-opennlp.csv";

		// NOTE(review): "is-moari" looks like a typo of "is-maori"; left unchanged
		// because consumers of the CSV may depend on the existing column name.
		String csv_header_line = String.join(",",
				"vol-id", "predicted-lang", "confidence", "is-moari");

		classifyDirectory(volume_text_input_base_dir, cvs_classification_filename, csv_header_line,
				OpenNLPLanguageVolumeClassification::ClassifyVolumeLRL);
	}

	/**
	 * Runs the per-page and then the per-volume classification over
	 * {@code <LRL_HOME>/data-output/volume-ef-text}, writing both CSV files
	 * to {@code <LRL_HOME>/data-output}.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args)
	{
		System.setProperty("lrl.home", UtilIO.LRL_HOME );

		String data_output_base_dir = UtilIO.LRL_HOME + File.separator + "data-output";
		String volume_text_input_base_dir = data_output_base_dir + File.separator + "volume-ef-text";

		// The volume counter is reset before each pass so numbering (and the
		// DEBUG cut-off) starts afresh for each CSV file
		id_count = 0;
		perPage(volume_text_input_base_dir,data_output_base_dir);

		id_count = 0;
		perVolume(volume_text_input_base_dir,data_output_base_dir);

		System.out.flush();
		System.err.println("Done!");
	}

	/**
	 * Reports whether the DEBUG cut-off applies to the current volume,
	 * printing a one-off notice the first time the limit is hit.
	 */
	private static boolean debugLimitReached()
	{
		if (!DEBUG || id_count < DEBUG_MAX_VOLUMES) {
			return false;
		}

		if (id_count == DEBUG_MAX_VOLUMES) {
			// Flush stdout first so the notice on stderr is not interleaved mid-line
			System.out.flush();
			System.err.println("*** Cutting CVS output short");
		}
		return true;
	}

	/**
	 * Derives the volume id from a file path: the file name without its last
	 * extension, with the escapes {@code %2F} and {@code %3A} turned back
	 * into {@code /} and {@code :}.
	 */
	private static String volumeIdFromPath(Path input_path)
	{
		String vol_id_filesafe = input_path.getFileName().toString().replaceAll("\\.[^\\.]+$", "");
		return vol_id_filesafe.replace("%2F", "/").replace("%3A", ":");
	}

	/**
	 * Appends the three prediction columns (language, confidence, is-Maori)
	 * for {@code text} to {@code csv_items}. Blank text is not sent to the
	 * detector: {@code blank_warning} is printed to stderr and placeholder
	 * values are added instead.
	 */
	private static void addPredictionColumns(List<String> csv_items, String text, String blank_warning)
	{
		// guard against it being all white-space
		if (StringUtils.isBlank(text)) {
			System.err.println(blank_warning);

			csv_items.add("undefined");
			csv_items.add("0.0");
			csv_items.add("UNDEFINED");
			return;
		}

		Language opennlp_predicted_lang = maoriTextDetector.predictLanguage(text);
		String lang = opennlp_predicted_lang.getLang();
		double conf = opennlp_predicted_lang.getConfidence();

		boolean is_maori = lang.equals(MAORI_3LETTER_CODE) && conf >= minConfidence;

		System.out.println("  Best language: " + lang + ", confidence = " + conf);

		csv_items.add(lang);
		csv_items.add(Double.toString(conf));
		csv_items.add(Boolean.toString(is_maori));
	}

	/** Appends {@code csv_items} to {@code csv_buffer} as one comma-separated, newline-terminated line. */
	private static void appendCsvLine(StringBuffer csv_buffer, List<String> csv_items)
	{
		csv_buffer.append(String.join(",", csv_items) + "\n");
	}

	/**
	 * Creates a fresh detector, applies {@code classifier} to every regular
	 * file under {@code volume_text_input_base_dir}, and writes the header
	 * plus all rows the classifier produced to
	 * {@code csv_classification_filename}.
	 *
	 * <p>A failure while creating the detector or walking the tree is printed;
	 * whatever was collected up to that point (at least the header) is still
	 * written.
	 */
	private static void classifyDirectory(String volume_text_input_base_dir, String csv_classification_filename,
			String csv_header_line, BiConsumer<Path, StringBuffer> classifier)
	{
		StringBuffer csv_buffer = new StringBuffer();
		csv_buffer.append(csv_header_line + "\n");

		try {
			maoriTextDetector = new LanguageTextDetector(false, minConfidence); // runSilent=false

			// Files.walk holds open directory handles until the stream is
			// closed, so it must be used in try-with-resources
			try (Stream<Path> walked_paths = Files.walk(Paths.get(volume_text_input_base_dir))) {
				walked_paths
					.filter(Files::isRegularFile)
					.forEach(item -> classifier.accept(item, csv_buffer));
			}
		}
		catch (Exception e) {
			e.printStackTrace();
		}

		String csv_text = csv_buffer.toString();
		System.out.println("Saving OpenNLP analysis as CVS file:");
		System.out.println("  " + csv_classification_filename);
		UtilIO.writeTextToFile(csv_classification_filename, csv_text);
	}

}
