package org.hathitrust.lrl.core;

import java.io.File;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.lang3.StringUtils;

/**
 * Builds a seed word list for a low-resource language (LRL).
 *
 * <p>Constructing an instance reads an English dictionary and a Maori
 * high-frequency word list from {@code <LRL_HOME>/etc}, then narrows the
 * Maori list in three steps: words of one or two characters are dropped,
 * words that also appear in the English dictionary (with or without their
 * accents) are dropped, and finally the accents are stripped from the
 * remaining words. Each intermediate list is kept in a protected field.
 *
 * <p>The static helpers implement the individual steps and can be used on
 * their own. Progress reporting goes to {@code System.out} and is
 * controlled by {@link #DEBUG_LEVEL}.
 */
public class SeedWords 
{
	/**
	 * Verbosity of the diagnostic output written to {@code System.out}:
	 * {@code >= 1} prints summary counts, {@code >= 2} additionally lists
	 * the words that were filtered out / found to be accented, and
	 * {@code >= 3} additionally prints per-word messages and full word dumps.
	 * Values below 1 print nothing.
	 */
	public static int DEBUG_LEVEL = 2;
	
	/**
	 * Copies a list of words into a set, discarding duplicates.
	 *
	 * @param input_words the words to copy; must not be {@code null}
	 * @return a new {@link HashSet} containing every distinct element of {@code input_words}
	 */
	public static Set<String> convertListToSet(List<String> input_words)
	{
		return new HashSet<String>(input_words);
	}

	/**
	 * Returns the words that are longer than two characters.
	 *
	 * <p>Length is measured with {@link String#length()}, i.e. in UTF-16 code
	 * units. The input list is not modified. Summary counts are printed when
	 * {@code DEBUG_LEVEL >= 1}, and the rejected words when {@code DEBUG_LEVEL >= 2}.
	 *
	 * @param input_words the candidate words
	 * @return a new list holding, in their original order, the words whose length exceeds 2
	 */
	public static ArrayList<String> removeOneAndTwoLetterWords(ArrayList<String> input_words)
	{
		ArrayList<String> output_words = new ArrayList<String>();
		ArrayList<String> filtered_out_words = new ArrayList<String>();
		
		for (String word: input_words) {
			if (word.length() > 2) {
				output_words.add(word);
			}
			else {
				filtered_out_words.add(word);
			}
		}
		
		if (DEBUG_LEVEL >= 1) {
			int size_of_lrl_word_list = input_words.size();	
			int size_of_filtered_lrl_word_list = output_words.size();
			
			int num_words_filtered = size_of_lrl_word_list - size_of_filtered_lrl_word_list;
								
			System.out.println("Removing one-letter and two-letter words:");
			System.out.println("  Starting low-resource language word list size: " + size_of_lrl_word_list);
			System.out.println("  Resulting low-resource language word list size: " + size_of_filtered_lrl_word_list);
			System.out.println("  => number of filtered out words: " + num_words_filtered);
		}
		if (DEBUG_LEVEL >= 2) {
			System.out.println("  Filtered words were: " + String.join(",",  filtered_out_words));
		}
		
		return output_words;
	}
	
	
	/**
	 * Returns the words that do not appear in the English dictionary.
	 *
	 * <p>A word is rejected when its lower-cased form is in {@code english_dict},
	 * or when its lower-cased form with accents stripped is in {@code english_dict}.
	 * Because the lookup key is always lower-cased, only lower-case dictionary
	 * entries can match. The input list is not modified and surviving words keep
	 * their original spelling (case and accents).
	 *
	 * <p>Summary counts are printed when {@code DEBUG_LEVEL >= 1}, the rejected
	 * words when {@code DEBUG_LEVEL >= 2}, and a message per rejected word when
	 * {@code DEBUG_LEVEL >= 3}.
	 *
	 * @param input_words  the candidate words
	 * @param english_dict the English words to check against
	 * @return a new list holding, in their original order, the words not found in the dictionary
	 */
	public static ArrayList<String> removeLoanwords(ArrayList<String> input_words, Set<String> english_dict)
	{
		ArrayList<String> output_words = new ArrayList<String>();
		ArrayList<String> filtered_out_words = new ArrayList<String>();
		
		for (String word: input_words) {

			// NOTE(review): toLowerCase() uses the JVM default locale; confirm this
			// matches however the dictionary itself was lower-cased.
			String word_lc = word.toLowerCase();
			if (english_dict.contains(word_lc)) {
				if (DEBUG_LEVEL >= 3) {
					System.out.println("*** Found in English dictionary, skipping: " + word);
				}
				filtered_out_words.add(word);
			}
			else {
				// Not found as written: strip any accents and look again in the dictionary
				
				String opt_word_lc_without_accents = StringUtils.stripAccents(word_lc);
				if (english_dict.contains(opt_word_lc_without_accents)) {
					if (DEBUG_LEVEL >= 3) {
						System.out.println("*** Found unaccented LRL word in English dictionary, removing: " + word);
					}
					filtered_out_words.add(word);
				}
				else {
					output_words.add(word);
				}
			}
		}
		
		if (DEBUG_LEVEL >= 1) {
			int size_of_english_dict = english_dict.size();
			int size_of_lrl_word_list = input_words.size();	
			int size_of_filtered_lrl_word_list = output_words.size();
			
			int num_words_filtered = size_of_lrl_word_list - size_of_filtered_lrl_word_list;
			
			System.out.println("English dictionary cross-check:");
			System.out.println("  Used an English dictionary of size: " + size_of_english_dict);
			System.out.println("  Starting low-resource language word list size: " + size_of_lrl_word_list);
			System.out.println("  Resulting low-resource language word list size: " + size_of_filtered_lrl_word_list);
			System.out.println("  => number of filtered out (loan) words: " + num_words_filtered);
		}
		if (DEBUG_LEVEL >= 2) {
			System.out.println("  Filtered words were: " + String.join(",",  filtered_out_words));
		}
		
		return output_words;
	}
	
	/**
	 * Strips accents from a word list in place and reports which words carried them.
	 *
	 * <p><b>This method modifies {@code input_words}.</b> On return:
	 * <ul>
	 *   <li>every word whose accent-stripped, lower-cased form is in
	 *       {@code english_dict} has been removed from {@code input_words};</li>
	 *   <li>every remaining word that contained accents has been replaced by its
	 *       accent-stripped form (case is preserved);</li>
	 *   <li>the relative order of the remaining words is unchanged.</li>
	 * </ul>
	 *
	 * <p>A message per removed word is printed when {@code DEBUG_LEVEL >= 3},
	 * matching the equivalent message in {@link #removeLoanwords}.
	 *
	 * @param input_words  the words to process; updated in place as described above
	 * @param english_dict the English words to check against
	 * @return a new list of the original (still accented) spellings of the words
	 *         that were replaced, in their original order
	 */
	public static ArrayList<String> extractAccentedWords(ArrayList<String> input_words, Set<String> english_dict)
	{
		ArrayList<String> accented_words = new ArrayList<String>();
		
		// Collect the survivors in a single pass and write them back afterwards,
		// rather than calling remove(i) inside the loop (each such call shifts
		// the whole tail of the ArrayList).
		List<String> kept_words = new ArrayList<String>(input_words.size());
		
		for (String word: input_words) {
			String opt_word_without_accents = StringUtils.stripAccents(word);
			
			String opt_word_without_accents_lc = opt_word_without_accents.toLowerCase();
			if (english_dict.contains(opt_word_without_accents_lc)) {
				if (DEBUG_LEVEL >= 3) {
					System.out.println("*** Found unaccented LRL word in English dictionary, removing: " + word);
				}
				continue;
			}
			
			if (word.equals(opt_word_without_accents)) {
				// No accents present: keep the word exactly as it was
				kept_words.add(word);
			}
			else {
				kept_words.add(opt_word_without_accents);
				accented_words.add(word);
			}
		}
		
		// Callers rely on input_words itself being updated, so refill the same list
		input_words.clear();
		input_words.addAll(kept_words);
		 
		return accented_words;
	}

	/**
	 * Prints a labelled list of words to {@code System.out}, one word per line,
	 * preceded by a three-line banner containing the label.
	 *
	 * @param label heading printed (followed by a colon) above the words
	 * @param words the words to print
	 */
	public static void printWords(String label, ArrayList<String> words)
	{
	
		System.out.println("=====");
		System.out.println(label + ":");
		System.out.println("=====");
		
		for (String word: words) {
			System.out.println(word);
		}
	}
	
	/** Lines of the English dictionary file, read with lower-casing requested. */
	protected ArrayList<String> _english_words;
	/** Lines of the Maori frequent-word file, read with case left as is. */
	protected ArrayList<String> _freq_lrl_words;
	/** {@link #_freq_lrl_words} with the one- and two-character words removed. */
	protected ArrayList<String> _freq_lrl_words_mincharlen;
	/**
	 * {@link #_freq_lrl_words_mincharlen} with English-dictionary matches removed;
	 * after construction its words have also had their accents stripped.
	 */
	protected ArrayList<String> _filtered_lrl_words;
	/** Original accented spellings of the words in {@link #_filtered_lrl_words} that had accents. */
	protected ArrayList<String> _lrl_words_with_accents;
	
	/**
	 * Loads the word lists and runs the full filtering pipeline.
	 *
	 * <p>Reads {@code etc/english-words-dict.txt} and {@code etc/maori-words-1000.txt}
	 * below {@code LRL_HOME} via {@code UtilIO.readLineBasedTextFile}, then applies
	 * {@link #removeOneAndTwoLetterWords}, {@link #removeLoanwords} and
	 * {@link #extractAccentedWords} in that order. Diagnostic output is governed
	 * by {@link #DEBUG_LEVEL}.
	 *
	 * @param LRL_HOME directory that contains the {@code etc} sub-directory with the word files
	 */
	public SeedWords(String LRL_HOME)
	{
		String etc_dir = LRL_HOME + File.separator + "etc" + File.separator;
		
		// English dictionary from:
		//   https://raw.githubusercontent.com/dwyl/english-words/master/words.txt
		String english_dict_filename = etc_dir + "english-words-dict.txt";

		_english_words = UtilIO.readLineBasedTextFile(english_dict_filename,UtilIO.CaseProcessing.ToLower);
		Set<String> english_dict = SeedWords.convertListToSet(_english_words);

		// Maori 1000 frequent word list from:
		//   https://tereomaori.tki.org.nz/Teacher-tools/Te-Whakaipurangi-Rauemi/High-frequency-word-lists
		String maori_words_filename = etc_dir + "maori-words-1000.txt";

		_freq_lrl_words = UtilIO.readLineBasedTextFile(maori_words_filename,UtilIO.CaseProcessing.AsIs);

		_freq_lrl_words_mincharlen = SeedWords.removeOneAndTwoLetterWords(_freq_lrl_words);
		
		_filtered_lrl_words = SeedWords.removeLoanwords(_freq_lrl_words_mincharlen,english_dict);
		
		// extractAccentedWords() rewrites _filtered_lrl_words in place (accents stripped)
		_lrl_words_with_accents = SeedWords.extractAccentedWords(_filtered_lrl_words,english_dict);

		
		if (DEBUG_LEVEL >= 1) {
			System.out.println("Number of low-resource words that use accents: " + _lrl_words_with_accents.size());
		}
		if (DEBUG_LEVEL >= 2) {
			System.out.println("  Accented words are: " + String.join(",",  _lrl_words_with_accents));
		}
		
		if (DEBUG_LEVEL >= 3) {
			printWords("Filtered words without accents", _filtered_lrl_words);
			printWords("Just the accented words", _lrl_words_with_accents);
		}
	}
	
	/**
	 * Returns the final filtered, accent-stripped word list.
	 *
	 * @return the internal list itself (not a copy); changes made to it by the
	 *         caller are visible to this object
	 */
	public ArrayList<String> getFilteredWords()
	{
		return _filtered_lrl_words;
	}
}
