package org.hathitrust.lrl.pagelevel;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.stream.Stream;

import org.apache.commons.lang3.StringUtils;
import org.hathitrust.lrl.core.LanguageTextDetector;
import org.hathitrust.lrl.core.UtilIO;

import opennlp.tools.langdetect.Language;

/**
 * Command-line driver that runs a {@link LanguageTextDetector} over every
 * {@code .txt} page file found under the {@code page-ef-text} output directory
 * and records, one CSV row per page, the predicted language, its confidence and
 * whether the page counts as Maori.
 *
 * <p>All state is held in static fields; the class is not designed for
 * concurrent use.
 */
public class OpenNLPLanguageClassification {

	/** Language code that the detector's prediction is compared against. */
	protected static String MAORI_3LETTER_CODE = "mri";

	/** Minimum confidence a Maori prediction needs for the page to be flagged as Maori. */
	protected static double minConfidence = 0.9;
	
	/** When true, only the first nine {@code .txt} files encountered are classified. */
	protected static boolean DEBUG = false;
	
	/** Detector shared by all calls to {@link #ClassifyLRL}; initialised in {@link #main}. */
	protected static LanguageTextDetector maoriTextDetector;
	
	/** Running count of {@code .txt} files seen so far (1-based in log output). */
	protected static int id_count = 0;

	/** Separator between the volume id and the page sequence number in a vol-page id. */
	private static final String SEQ_MARKER = "-seq-";

	/**
	 * Joins the given fields with commas and terminates the row with a newline.
	 * No quoting or escaping is applied to the fields.
	 */
	private static String csvLine(String... fields)
	{
		return String.join(",", fields) + "\n";
	}

	/**
	 * Classifies the language of a single page file and appends one CSV row
	 * (vol-page id, vol id, page id, language, confidence, is-Maori) to
	 * {@code csv_buffer}.
	 *
	 * <p>Files whose path does not end in {@code .txt} are ignored. Blank pages
	 * produce a row with {@code undefined}, {@code 0.0}, {@code UNDEFINED}.
	 * Unreadable files and files whose classification throws produce no row;
	 * the problem is reported on {@code System.err}.
	 *
	 * @param input_path  path of the page text file; the file name (minus its
	 *                    extension, with {@code %2F} and {@code %3A} decoded)
	 *                    is used as the vol-page id
	 * @param csv_buffer  buffer that receives the CSV row
	 */
	public static void ClassifyLRL(Path input_path, StringBuffer csv_buffer)
	{
		String input_filename = input_path.toString();

		if (!input_filename.endsWith(".txt")) {
			return;
		}

		id_count++;

		if ((DEBUG) && (id_count>=10)) {
			if (id_count==10) {
				// Flush stdout first so the notice appears after the progress lines
				System.out.flush();
				System.err.println("*** Cutting CVS output short");
			}
			return;
		}

		Path just_filename = input_path.getFileName();
		String volpage_id_filesafe = just_filename.toString().replaceAll("\\.[^\\.]+$", "");
		String volpage_id  = volpage_id_filesafe.replaceAll("%2F", "/").replaceAll("%3A", ":");

		String vol_id = volpage_id.replaceAll("-seq-[0-9]+$","");

		// Strip the "<vol_id>-seq-" prefix literally. Volume ids can contain regex
		// metacharacters (e.g. '.', '$'), so the id must not be spliced into a pattern.
		String seq_prefix = vol_id + SEQ_MARKER;
		String page_id = volpage_id.startsWith(seq_prefix)
				? volpage_id.substring(seq_prefix.length())
				: volpage_id;

		try {
			System.out.println("Classifying full-text page #" + id_count + ": " + input_filename);

			if (!Files.isReadable(input_path)) {
				System.err.println("Warning: Skipping unreadable file '" + input_filename + "'");
				return;
			}

			String text = UtilIO.readTextFile(input_filename);

			if (StringUtils.isBlank(text)) {
				System.err.println("Warning: Volume-page id '"+volpage_id+"' contains no text");

				csv_buffer.append(csvLine(volpage_id, vol_id, page_id, "undefined", "0.0", "UNDEFINED"));
				return;
			}

			Language opennlp_predicted_lang = maoriTextDetector.predictLanguage(text);
			String lang = opennlp_predicted_lang.getLang();
			double conf = opennlp_predicted_lang.getConfidence();

			boolean is_maori = (lang.equals(MAORI_3LETTER_CODE) && conf >= minConfidence);

			System.out.println("  Best language: " + lang + ", confidence = " + conf);

			csv_buffer.append(csvLine(volpage_id, vol_id, page_id, lang,
					Double.toString(conf), Boolean.toString(is_maori)));
		}
		catch (Exception e) {
			// Best effort: report the failing file and carry on with the remaining pages
			System.err.println("Error: Failed to classify '" + input_filename + "'");
			e.printStackTrace();
		}
	}
	
	
	/**
	 * Walks the {@code data-output/page-ef-text} directory under
	 * {@code UtilIO.LRL_HOME}, classifies every regular file via
	 * {@link #ClassifyLRL}, and writes the accumulated CSV to
	 * {@code data-output/maori-fulltext-page-classification-opennlp.csv}.
	 *
	 * <p>If the detector cannot be created or the walk fails, the stack trace is
	 * printed and whatever rows were gathered so far (at least the header) are
	 * still written out.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) 
	{	
		System.setProperty("lrl.home", UtilIO.LRL_HOME );
		
		String data_output_base_dir = UtilIO.LRL_HOME + File.separator + "data-output"; 
		String page_text_base_dir = data_output_base_dir + File.separator + "page-ef-text";

		String cvs_classification_filename = data_output_base_dir + File.separator + "maori-fulltext-page-classification-opennlp.csv";
		StringBuffer csv_buffer = new StringBuffer();
		
		// NOTE(review): "is-moari" looks like a typo for "is-maori"; it is kept
		// as-is because consumers of the CSV may key on this column name.
		csv_buffer.append(csvLine(
				"vol-page-id",
				"vol-id",
				"page-id",
				"predicted-lang",
				"confidence",
				"is-moari"));
		
		try {
			maoriTextDetector = new LanguageTextDetector(false, minConfidence); // runSilent=false
	
			// Files.walk holds open directory handles; close the stream when done
			try (Stream<Path> page_files = Files.walk(Paths.get(page_text_base_dir))) {
				page_files
					.filter(Files::isRegularFile)
					.forEach(item -> OpenNLPLanguageClassification.ClassifyLRL(item, csv_buffer) );
			}
		}
		catch (Exception e) {
			e.printStackTrace();
		}

		System.out.println();
		System.out.println("Outputing results to:");
		System.out.println("  " + cvs_classification_filename);
		String csv_text = csv_buffer.toString();
		UtilIO.writeTextToFile(cvs_classification_filename, csv_text);
		
	
		System.out.flush();
		System.err.println("Done!");
	
	}

}
