package org.greenstone.atlas.server; import java.util.HashMap; import java.util.ArrayList; public class MarkupService { /** * Takes the given text and searches it for place names and then marks up any matches * @param originalText is the given text * @return a marked up version of the text */ public String getMarkedUpText(String originalText) { //Get the gazetteer GazetteerTrieType2 gazetteer = new GazetteerTrieType2("/research/sjm84/Msc/Downloads/dataen.txt"); removeAmbiguousPlaceNames(gazetteer); //Find the words in the text ArrayList words = findWords(originalText); //Find the matches HashMap matches = findMatches(gazetteer, words); //Return the marked up text return createMarkedUpText(words, matches, originalText); } /** * Takes the given text, list of words and list of matches to create marked up text * @param words is the list of words in the text * @param matches is the list of gazetteer matches in the text * @param originalText is the orignal text that is to be marked up * @return the marked up text */ protected String createMarkedUpText(ArrayList words, HashMap matches, String originalText) { StringBuilder markedUpText = new StringBuilder(); //Used to count what word in words is being used int wordCount = 0; //Go through each character int i = 0; while(i < originalText.length()) { //If the current character in the text is a letter if(Character.isLetter(originalText.charAt(i))) { Integer numOfWordsInPlaceName = matches.get(wordCount); //If the word is a place name if(numOfWordsInPlaceName != null) { int endingCharacters = 0; StringBuilder placeName = new StringBuilder(); //Get each word in the place name for(int j = 0; j < numOfWordsInPlaceName; j++) { //Add the word to the place name placeName.append(words.get(wordCount)); //Move the corresponding amount of characters ahead i += words.get(wordCount).length(); wordCount++; Character c = null; endingCharacters = 0; //Add any characters between the words in the place name while(i < originalText.length() && !Character.isLetter(c = originalText.charAt(i))) { placeName.append(c); i++; endingCharacters++; } } //Mark up the place name markedUpText.append("" + placeName.substring(0, placeName.length() - endingCharacters) + "" + placeName.substring(placeName.length() - endingCharacters, placeName.length())); } //If the word is not a place name else { //Add it to the marked up text as is String word = words.get(wordCount++); markedUpText.append(word); i += word.length(); } } //If the current character in the text is not a letter else { //Add it to the marked up text as is markedUpText.append(originalText.charAt(i)); i++; } } return markedUpText.toString(); } /** * Searches the words of a text for matches in the gazetteer * @param gaz is the gazetter (in trie form) to use * @param text is the list of words to search * @return a list of matches in the format <[first word index], [number of words in match]> */ protected HashMap findMatches(GazetteerTrieType2 gaz, ArrayList words) { HashMap matches = new HashMap(); //Go through every word in the list of words for(int i = 0; i < words.size(); i++) { int j = i; int result = 0; StringBuilder currentWord = new StringBuilder(); //Until a dead end in the trie is reached or there is no more words to append while(j < words.size()) { if(j == i && Character.isLowerCase(words.get(j).charAt(0))) { break; } currentWord = currentWord.append(words.get(j)); //Check the words in the gazetteer to see if there is a match result = gaz.checkPlaceName(currentWord.toString()); //If there is a match if(result == 1) { //Store the match in the form <[first word index], [number of words in match]> //Because shorter words are checked first longer matches will overwrite shorter matches matches.put(i, (j-i) + 1); } else if(result == -1) { break; } j++; currentWord = currentWord.append(" "); } } //Remove any overlapping matches and return return correctMatches(matches); } /** * Takes an array of matches and removes any that overlap * @param matches is the array of matches * @return the corrected array of matches */ protected HashMap correctMatches(HashMap matches) { Integer[] keys = matches.keySet().toArray(new Integer[0]); //Go through all the keys in the hash map for(int i = 0; i < keys.length-1; i++) { //Ignore keys that have been removed if(matches.get(keys[i]) == null) { continue; } //Check and see if the current key overlaps any other keys for(int j = keys[i] + 1; j < keys[i] + matches.get(keys[i]); j++) { //If so, remove the keys it overlaps if(matches.get(j) != null) { matches.remove(j); } } } return matches; } /** * Divides the given text into words and returns the list of words * @param text is the text to divide * @return the divided text */ protected static ArrayList findWords(String text) { ArrayList words = new ArrayList(); StringBuilder currentWord = new StringBuilder(); //Go through each character in the text for(int i = 0; i < text.length(); i++) { //If it is a letter then add it to the current word if(Character.isLetter(text.charAt(i)) || text.charAt(i) == '-') { currentWord.append(text.charAt(i)); } //If it is not a letter else { //And a word currently is currently being created if(currentWord.length() > 0 && text.charAt(i - currentWord.length() + 1) != '\'') { //Add the current word to the list of words words.add(currentWord.toString()); //Delete the current words currentWord.delete(0, currentWord.length()); } } } //Add the final word of the text if(currentWord.length() > 0) { words.add(currentWord.toString()); } return words; } /** * Removes place names that are unlikely to be meant as place names in a given text * @param gazetteer is the gazetteer to remove the place names from */ public void removeAmbiguousPlaceNames(GazetteerTrieType2 gazetteer) { gazetteer.removePlaceName("are"); gazetteer.removePlaceName("is"); gazetteer.removePlaceName("over"); gazetteer.removePlaceName("at"); gazetteer.removePlaceName("of"); gazetteer.removePlaceName("to"); gazetteer.removePlaceName("rule"); gazetteer.removePlaceName("time"); gazetteer.removePlaceName("real"); gazetteer.removePlaceName("national"); gazetteer.removePlaceName("early"); gazetteer.removePlaceName("by"); gazetteer.removePlaceName("as"); gazetteer.removePlaceName("eastern"); gazetteer.removePlaceName("western"); gazetteer.removePlaceName("southern"); gazetteer.removePlaceName("northern"); gazetteer.removePlaceName("east"); gazetteer.removePlaceName("west"); gazetteer.removePlaceName("south"); gazetteer.removePlaceName("north"); gazetteer.removePlaceName("this"); gazetteer.removePlaceName("between"); gazetteer.removePlaceName("many"); gazetteer.removePlaceName("strong"); gazetteer.removePlaceName("economy"); gazetteer.removePlaceName("mall"); gazetteer.removePlaceName("they"); gazetteer.removePlaceName("do"); gazetteer.removePlaceName("image"); gazetteer.removePlaceName("republic"); gazetteer.removePlaceName("section"); gazetteer.removePlaceName("dollar"); gazetteer.removePlaceName("index"); gazetteer.removePlaceName("day"); gazetteer.removePlaceName("council"); gazetteer.removePlaceName("use"); gazetteer.removePlaceName("log"); gazetteer.removePlaceName("logo"); gazetteer.removePlaceName("best"); gazetteer.removePlaceName("go"); gazetteer.removePlaceName("portal"); gazetteer.removePlaceName("list"); gazetteer.removePlaceName("english"); gazetteer.removePlaceName("page"); gazetteer.removePlaceName("see"); gazetteer.removePlaceName("ocean"); gazetteer.removePlaceName("island"); gazetteer.removePlaceName("x"); gazetteer.removePlaceName("country"); gazetteer.removePlaceName("colony"); gazetteer.removePlaceName("christian"); gazetteer.removePlaceName("black"); gazetteer.removePlaceName("independence"); gazetteer.removePlaceName("war"); gazetteer.removePlaceName("no"); gazetteer.removePlaceName("continental"); gazetteer.removePlaceName("continental"); gazetteer.removePlaceName("force"); gazetteer.removePlaceName("reform"); gazetteer.removePlaceName("rush"); gazetteer.removePlaceName("read"); gazetteer.removePlaceName("none"); gazetteer.removePlaceName("justice"); gazetteer.removePlaceName("font"); gazetteer.removePlaceName("u"); gazetteer.removePlaceName("y"); gazetteer.removePlaceName("normal"); gazetteer.removePlaceName("center"); gazetteer.removePlaceName("date"); gazetteer.removePlaceName("story"); gazetteer.removePlaceName("union"); gazetteer.removePlaceName("supreme"); gazetteer.removePlaceName("house"); gazetteer.removePlaceName("court"); gazetteer.removePlaceName("data"); gazetteer.removePlaceName("energy"); gazetteer.removePlaceName("white"); gazetteer.removePlaceName("universal"); gazetteer.removePlaceName("protection"); gazetteer.removePlaceName("great"); gazetteer.removePlaceName("star"); gazetteer.removePlaceName("banner"); gazetteer.removePlaceName("capital"); gazetteer.removePlaceName("much"); gazetteer.addPlaceName("United States"); } }