package org.greenstone.atlas.server;


import java.util.HashMap;
import java.util.ArrayList;

public class MarkupService 
{
	/**
	 * Takes the given text and searches it for place names and then marks up any matches
	 * @param originalText is the given text
	 * @return a marked up version of the text
	 */
	public String getMarkedUpText(String originalText)
	{
		//Get the gazetteer
		GazetteerTrieType2 gazetteer = new GazetteerTrieType2("/research/sjm84/Msc/Downloads/dataen.txt");
		
		removeAmbiguousPlaceNames(gazetteer);
		
		//Find the words in the text
		ArrayList<String> words = findWords(originalText);
		
		//Find the matches
		HashMap<Integer, Integer> matches = findMatches(gazetteer, words);
		
		//Return the marked up text
		return createMarkedUpText(words, matches, originalText);
	}
	
	/**
	 * Takes the given text, list of words and list of matches to create marked up text
	 * @param words is the list of words in the text
	 * @param matches is the list of gazetteer matches in the text
	 * @param originalText is the orignal text that is to be marked up
	 * @return the marked up text
	 */
	protected String createMarkedUpText(ArrayList<String> words, HashMap<Integer, Integer> matches, String originalText)
	{
		StringBuilder markedUpText = new StringBuilder();
		
		//Used to count what word in words is being used
		int wordCount = 0;
		
		//Go through each character 
		int i = 0;
		while(i < originalText.length())
		{
			//If the current character in the text is a letter
			if(Character.isLetter(originalText.charAt(i)))
			{
				Integer numOfWordsInPlaceName = matches.get(wordCount);
				
				//If the word is a place name
				if(numOfWordsInPlaceName != null)
				{
					int endingCharacters = 0;
					StringBuilder placeName = new StringBuilder();
					
					//Get each word in the place name
					for(int j = 0; j < numOfWordsInPlaceName; j++)
					{
						//Add the word to the place name
						placeName.append(words.get(wordCount));
						
						//Move the corresponding amount of characters ahead 
						i += words.get(wordCount).length();
						wordCount++;
						
						Character c = null;
						endingCharacters = 0;
						
						//Add any characters between the words in the place name
						while(i < originalText.length() && !Character.isLetter(c = originalText.charAt(i)))
						{
							placeName.append(c);
							i++;
							endingCharacters++;
						}
					}	
					
					//Mark up the place name
					markedUpText.append("<span style=\"background-color:yellow\">"
							+ placeName.substring(0, placeName.length() - endingCharacters) 
							+ "</span>" 
							+ placeName.substring(placeName.length() - endingCharacters, placeName.length()));
				}
				//If the word is not a place name
				else
				{	
					//Add it to the marked up text as is
					String word = words.get(wordCount++);
					markedUpText.append(word);
					i += word.length();
				}
			}
			//If the current character in the text is not a letter
			else
			{
				//Add it to the marked up text as is
				markedUpText.append(originalText.charAt(i));
				i++;
			}
		}
		
		return markedUpText.toString();
	}
	
	/**
	 * Searches the words of a text for matches in the gazetteer
	 * @param gaz is the gazetter (in trie form) to use
	 * @param text is the list of words to search
	 * @return a list of matches in the format <[first word index], [number of words in match]>
	 */
	protected HashMap<Integer, Integer> findMatches(GazetteerTrieType2 gaz, ArrayList<String> words)
	{
		HashMap<Integer, Integer> matches = new HashMap<Integer, Integer>();
		
		//Go through every word in the list of words
		for(int i = 0; i < words.size(); i++)
		{
			int j = i;
			int result = 0;
			StringBuilder currentWord = new StringBuilder();
			
			//Until a dead end in the trie is reached or there is no more words to append
			while(j < words.size())
			{
				if(j == i && Character.isLowerCase(words.get(j).charAt(0)))
				{
					break;
				}
				
				currentWord = currentWord.append(words.get(j));
				
				//Check the words in the gazetteer to see if there is a match
				result = gaz.checkPlaceName(currentWord.toString());
				
				//If there is a match
				if(result == 1)
				{
					//Store the match in the form <[first word index], [number of words in match]>
					//Because shorter words are checked first longer matches will overwrite shorter matches
					matches.put(i, (j-i) + 1);
				}
				else if(result == -1)
				{
					break;
				}
				j++;
				currentWord = currentWord.append(" ");
			}
		}
		
		//Remove any overlapping matches and return
		return correctMatches(matches);
	}
	
	/**
	 * Takes an array of matches and removes any that overlap
	 * @param matches is the array of matches
	 * @return the corrected array of matches
	 */
	protected HashMap<Integer, Integer> correctMatches(HashMap<Integer, Integer> matches)
	{
		Integer[] keys = matches.keySet().toArray(new Integer[0]);
		
		//Go through all the keys in the hash map
		for(int i = 0; i < keys.length-1; i++)
		{
			//Ignore keys that have been removed
			if(matches.get(keys[i]) == null)
			{
				continue;
			}
			
			//Check and see if the current key overlaps any other keys
			for(int j = keys[i] + 1; j < keys[i] + matches.get(keys[i]); j++)
			{
				//If so, remove the keys it overlaps
				if(matches.get(j) != null)
				{
					matches.remove(j);
				}
			}
		}
		
		return matches;
	}
	
	/**
	 * Divides the given text into words and returns the list of words
	 * @param text is the text to divide
	 * @return the divided text
	 */
	protected static ArrayList<String> findWords(String text)
	{
		ArrayList<String> words = new ArrayList<String>();		
		StringBuilder currentWord = new StringBuilder();
		
		//Go through each character in the text
		for(int i = 0; i < text.length(); i++)
		{
			//If it is a letter then add it to the current word
			if(Character.isLetter(text.charAt(i)) || text.charAt(i) == '-')
			{
				currentWord.append(text.charAt(i));
			}
			//If it is not a letter
			else
			{
				//And a word currently is currently being created
				if(currentWord.length() > 0 && text.charAt(i - currentWord.length() + 1) != '\'')
				{
					//Add the current word to the list of words
					words.add(currentWord.toString());	
					
					//Delete the current words
					currentWord.delete(0, currentWord.length());
				}
			}
		}
		
		//Add the final word of the text
		if(currentWord.length() > 0)
		{
			words.add(currentWord.toString());
		}
		
		return words;
	}
	
	/**
	 * Removes place names that are unlikely to be meant as place names in a given text
	 * @param gazetteer is the gazetteer to remove the place names from
	 */
	public void removeAmbiguousPlaceNames(GazetteerTrieType2 gazetteer)
	{
		gazetteer.removePlaceName("are");			gazetteer.removePlaceName("is");
		gazetteer.removePlaceName("over");			gazetteer.removePlaceName("at");
		gazetteer.removePlaceName("of");			gazetteer.removePlaceName("to");
		gazetteer.removePlaceName("rule");			gazetteer.removePlaceName("time");
		gazetteer.removePlaceName("real");			gazetteer.removePlaceName("national");
		gazetteer.removePlaceName("early");			gazetteer.removePlaceName("by");
		gazetteer.removePlaceName("as");			gazetteer.removePlaceName("eastern");
		gazetteer.removePlaceName("western");		gazetteer.removePlaceName("southern");
		gazetteer.removePlaceName("northern");		gazetteer.removePlaceName("east");
		gazetteer.removePlaceName("west");			gazetteer.removePlaceName("south");
		gazetteer.removePlaceName("north");			gazetteer.removePlaceName("this");
		gazetteer.removePlaceName("between");		gazetteer.removePlaceName("many");
		gazetteer.removePlaceName("strong");		gazetteer.removePlaceName("economy");
		gazetteer.removePlaceName("mall");			gazetteer.removePlaceName("they");
		gazetteer.removePlaceName("do");			gazetteer.removePlaceName("image");
		gazetteer.removePlaceName("republic");		gazetteer.removePlaceName("section");
		gazetteer.removePlaceName("dollar");		gazetteer.removePlaceName("index");
		gazetteer.removePlaceName("day");			gazetteer.removePlaceName("council");
		gazetteer.removePlaceName("use");			gazetteer.removePlaceName("log");
		gazetteer.removePlaceName("logo");			gazetteer.removePlaceName("best");
		gazetteer.removePlaceName("go");			gazetteer.removePlaceName("portal");
		gazetteer.removePlaceName("list");			gazetteer.removePlaceName("english");
		gazetteer.removePlaceName("page");			gazetteer.removePlaceName("see");
		gazetteer.removePlaceName("ocean");			gazetteer.removePlaceName("island");
		gazetteer.removePlaceName("x");				gazetteer.removePlaceName("country");
		gazetteer.removePlaceName("colony");		gazetteer.removePlaceName("christian");
		gazetteer.removePlaceName("black");			gazetteer.removePlaceName("independence");
		gazetteer.removePlaceName("war");			gazetteer.removePlaceName("no");
		gazetteer.removePlaceName("continental");	gazetteer.removePlaceName("continental");
		gazetteer.removePlaceName("force");			gazetteer.removePlaceName("reform");
		gazetteer.removePlaceName("rush");			gazetteer.removePlaceName("read");
		gazetteer.removePlaceName("none");			gazetteer.removePlaceName("justice");
		gazetteer.removePlaceName("font");			gazetteer.removePlaceName("u");
		gazetteer.removePlaceName("y");				gazetteer.removePlaceName("normal");
		gazetteer.removePlaceName("center");		gazetteer.removePlaceName("date");
		gazetteer.removePlaceName("story");			gazetteer.removePlaceName("union");
		gazetteer.removePlaceName("supreme");		gazetteer.removePlaceName("house");
		gazetteer.removePlaceName("court");			gazetteer.removePlaceName("data");
		gazetteer.removePlaceName("energy");		gazetteer.removePlaceName("white");
		gazetteer.removePlaceName("universal");		gazetteer.removePlaceName("protection");
		gazetteer.removePlaceName("great");			gazetteer.removePlaceName("star");
		gazetteer.removePlaceName("banner");		gazetteer.removePlaceName("capital");
		gazetteer.removePlaceName("much");
		
		gazetteer.addPlaceName("United States");
	}
}
