package org.greenstone.applet.GsdlCollageApplet;

import java.io.*;
import java.net.*;
import java.util.*;

/**
 * Reads the content of a URL and, when the URL looks like an HTML page,
 * collects the URLs named by <code>href</code>, <code>src</code> and
 * <code>background</code> attributes, plus the <code>href</code> of anything
 * that mentions <code>link</code>, while the content is being read.
 *
 * Links are only discovered as a side effect of {@link #read()} /
 * {@link #readAll()}; the link lists are empty until the page has been read.
 */
public class CURL {

    // Attribute names (lower case) recognised by findURL().
    private static final String KW_BACKGROUND = "background";
    private static final String KW_HREF       = "href";
    private static final String KW_LINK       = "link";
    private static final String KW_SRC        = "src";

    // Scanner states used by findURL().
    private static final int GROUND        = 0;  // nothing of interest seen
    private static final int COMMENT       = 1;  // seen "!"
    private static final int COMMENT_DASH  = 2;  // seen "!-"
    private static final int COMMENT_FINAL = 3;  // seen "!--": stop scanning the tag
    private static final int KEYWORD       = 4;  // part-way through an attribute name
    private static final int LINK_TAG      = 5;  // seen "link": now looking for its href
    private static final int LINK_KEYWORD  = 6;  // part-way through "href" after "link"
    private static final int NAME_END      = 7;  // attribute name complete, expecting '='
    private static final int VALUE_START   = 8;  // seen '=', expecting the value
    private static final int QUOTED        = 9;  // inside a quoted value
    private static final int UNQUOTED      = 10; // inside an unquoted value
    private static final int VALUE_FINAL   = 11; // value complete: stop scanning the tag

    /** False when the URL was malformed or could not be opened. */
    private boolean url_valid = true;
    /** Stream being read; null when the connection could not be made. */
    private InputStream input = null;
    /** One raw value of push-back; -1 means "nothing pushed back". */
    private int peek_value = -1;
    /** Characters of the current tag or text run that read() has yet to hand out. */
    private final StringBuilder buffer = new StringBuilder();
    /** Index in {@link #buffer} of the next character to hand out. */
    private int buffer_pos = 0;
    /** URL that relative links are resolved against (and that getURL() returns). */
    private URL url = null;
    /** Whether {@link #url} looks like an HTML page; computed once at construction. */
    private final boolean html;

    // The link lists hold java.net.URL elements. They stay raw Vectors so the
    // public getters keep their original signatures.
    private final Vector href_links = new Vector();
    private final Vector src_links = new Vector();
    private final Vector link_links = new Vector();
    private final Vector background_links = new Vector();

    /** Starts processing the given url for images and links.
     *  A failure to parse or open the url is not thrown; it is reported
     *  through {@link #connected_ok()}.
     *  @param url_str The url to examine */
    public CURL(String url_str) {
        try {
            url = new URL(url_str);
            input = url.openStream();
        } catch (IOException e) {
            // Covers MalformedURLException too (it is an IOException).
            url_valid = false;
        }
        html = looksLikeHTML(url);
    }

    /** Starts processing the given url for images and links.
     *  This variant of the constructor is needed for GS3: the content is
     *  read from url_str, but once the stream is open every relative link
     *  is resolved against base_url instead.
     *  A failure to parse or open either url is not thrown; it is reported
     *  through {@link #connected_ok()}.
     *  @param url_str  The url to examine
     *  @param base_url The url that relative links are resolved against */
    public CURL(String url_str, String base_url) {
        try {
            url = new URL(url_str);
            input = url.openStream();
            // hereafter, we will build up URLs by concatenating to baseURL not url_str
            url = new URL(base_url);
        } catch (IOException e) {
            url_valid = false;
            // base_url may have been rejected after the stream was opened:
            // do not leave that stream dangling.
            if (input != null) {
                try {
                    input.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done; the object is already marked invalid.
                }
                input = null;
            }
        }
        // NOTE(review): on success this tests base_url, not url_str, exactly
        // as the original isHTML() did -- confirm that is what GS3 wants.
        html = looksLikeHTML(url);
    }

    /** Checks that a valid connection to the url has been made */
    public boolean connected_ok() {
        return url_valid;
    }

    /** Gets any href links from this url
     *  @return Vector of href links (java.net.URL elements) */
    public Vector getHrefLinks() {
        return href_links;
    }

    /** Gets any source links from this url
     *  @return Vector of source links (java.net.URL elements) */
    public Vector getSrcLinks() {
        return src_links;
    }

    /** Gets any other links from this url
     *  @return Vector of other links (java.net.URL elements) */
    public Vector getLinkLinks() {
        return link_links;
    }

    /** Gets any background links from this url
     *  @return Vector of background links (java.net.URL elements) */
    public Vector getBackgroundLinks() {
        return background_links;
    }

    /** Gets the url currently being processed; null if it could not be parsed */
    public URL getURL() {
        return url;
    }

    /** Checks that the content of the url is in html. The answer is a guess
     *  made from the text of the url only.
     *  @return true if the url looks like an html page, false otherwise
     *          (including when the url could not be parsed) */
    public boolean isHTML() {
        return html;
    }

    /** Reads the next value of the content. For html content the value comes
     *  from the tag/text buffer, and links are recorded whenever a tag is
     *  buffered; other content is read straight from the stream.
     *  @return Value read if successful and -1 if not */
    public int read() {
        if (!html) {
            return getRaw();
        }
        if (buffer_pos >= buffer.length()) {
            refill();
        }
        // Still empty after a refill means the stream is exhausted.
        return (buffer_pos < buffer.length()) ? getBuffer() : -1;
    }

    /** Reads the entire URL, or until the current thread is interrupted,
     *  then closes the stream. */
    public void readAll() {
        while (!Thread.currentThread().isInterrupted() && read() != -1) {
            // Nothing to do here: read() records the links as it goes.
        }
        if (input != null) {
            try {
                input.close();
            } catch (IOException ioex) {
                System.err.println("CURL tried to close the inputstream, but got exception " + ioex);
            }
        }
        if (Thread.currentThread().isInterrupted()) {
            System.err.println("**** CURL.readAll() had been interrupted.");
        }
    }

    // Gets the head of the buffered buffer, or -1 (with a complaint) if it is empty.
    private int getBuffer() {
        if (buffer_pos >= buffer.length()) {
            System.err.println("Called getBuffer on an empty buffer");
            return -1;
        }
        int value = buffer.charAt(buffer_pos++);
        if (buffer_pos == buffer.length()) {
            // Everything has been handed out: recycle the storage.
            buffer.setLength(0);
            buffer_pos = 0;
        }
        return value;
    }

    // Gets the head of the raw buffer: the pushed-back value if there is one,
    // otherwise the next value from the stream. Returns -1 at end of stream,
    // on a read error, or when there is no stream at all.
    private int getRaw() {
        if (peek_value != -1) {
            int value = peek_value;
            peek_value = -1;
            return value;
        }
        if (input == null) {
            return -1;
        }
        try {
            return input.read(); // what if this blocks?? How can this be made interruptible?
        } catch (IOException e) {
            e.printStackTrace();
            return -1;
        }
    }

    // Looks at the next raw value without consuming it.
    private int peekRaw() {
        if (peek_value == -1) {
            peek_value = getRaw();
        }
        return peek_value;
    }

    // Refills the buffered buffer with the next tag or non-tag block.
    // The tag is checked for urls. Note a tag is taken to be < .. > or
    // < .. < so comments are supported, but comment blocks are still
    // scanned.
    private void refill() {
        int value = getRaw();
        if (value == -1) {
            return;
        }

        if (value != '<') {
            // Plain content: buffer everything up to the next tag.
            while (value != -1 && value != '<') {
                setBuffer(value);
                value = getRaw();
            }
            // If we've accidentally read the '<' push it back in the stream by
            // setting peek_value to value. Since the peek_value will be returned
            // on the next read this has the desired effect.
            if (value == '<') {
                peek_value = value;
            }
            return;
        }

        // A tag: collect its text, scan it for urls, then buffer it verbatim.
        setBuffer(value);
        StringBuilder tag = new StringBuilder();
        boolean closed = false;
        while (true) {
            value = getRaw();
            if (value == -1) {
                break;              // end of stream in the middle of a tag
            }
            if (value == '>') {
                closed = true;      // checked before peeking, so no read-ahead past '>'
                break;
            }
            tag.append((char) value);
            if (peekRaw() == '<') {
                break;              // "< .. <" form: the next tag starts right here
            }
        }
        buffer.append(findURL(tag.toString()));
        if (closed) {
            setBuffer('>');
        }
    }

    // Sets the tail of the buffered buffer.
    private void setBuffer(int value) {
        buffer.append((char) value);
    }

    // Lower-cases a tag except for the parts enclosed in double quotes.
    // Currently unused (its only call site is commented out).
    private String smartLower(String tag) {
        boolean lower = true;
        StringBuilder new_tag = new StringBuilder(tag.length());
        for (int i = 0; i < tag.length(); i++) {
            char ch = tag.charAt(i);
            // A double quote toggles case lowering on or off.
            if (ch == '"') {
                lower = !lower;
            }
            new_tag.append(lower ? Character.toLowerCase(ch) : ch);
        }
        return new_tag.toString();
    }

    // Runs the finite-state machine on the text of one tag (without its
    // enclosing '<' and '>'). At most one url is taken from a tag: the value
    // of the first href/src/background attribute found, where an href that
    // follows the word "link" is filed under the link list. Scanning stops
    // at "!--". Attribute names are matched case-insensitively.
    // Returns the tag text unchanged, ready to be appended to the buffer.
    private String findURL(String tag) {
        int state = GROUND;
        String keyword = "";       // attribute name currently being matched
        int matched = 0;           // number of keyword characters matched so far
        Vector target = null;      // list that will receive the url
        char quote = 0;            // quote character that opened the value
        StringBuilder value = new StringBuilder();

        int length = tag.length();
        for (int i = 0; i < length && state != COMMENT_FINAL && state != VALUE_FINAL; i++) {
            char ch = tag.charAt(i);
            // Character.toLowerCase is locale-independent, unlike String.toLowerCase().
            char lch = Character.toLowerCase(ch);
            boolean blank = (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');

            switch (state) {
            case GROUND:
                switch (lch) {
                case '!': state = COMMENT; break;
                case 'b': keyword = KW_BACKGROUND; matched = 1; state = KEYWORD; break;
                case 'h': keyword = KW_HREF;       matched = 1; state = KEYWORD; break;
                case 'l': keyword = KW_LINK;       matched = 1; state = KEYWORD; break;
                case 's': keyword = KW_SRC;        matched = 1; state = KEYWORD; break;
                default: break;
                }
                break;
            case COMMENT:
                state = (lch == '-') ? COMMENT_DASH : GROUND;
                break;
            case COMMENT_DASH:
                state = (lch == '-') ? COMMENT_FINAL : GROUND; // skip comments
                break;
            case KEYWORD:
                if (lch != keyword.charAt(matched)) {
                    state = GROUND;
                } else if (++matched == keyword.length()) {
                    if (keyword.equals(KW_LINK)) {
                        state = LINK_TAG;
                    } else {
                        if (keyword.equals(KW_HREF)) {
                            target = href_links;
                        } else if (keyword.equals(KW_SRC)) {
                            target = src_links;
                        } else {
                            target = background_links;
                        }
                        state = NAME_END;
                    }
                }
                break;
            case LINK_TAG:
                // Everything is skipped until something that might be "href".
                if (lch == 'h') {
                    keyword = KW_HREF;
                    matched = 1;
                    state = LINK_KEYWORD;
                }
                break;
            case LINK_KEYWORD:
                if (lch != keyword.charAt(matched)) {
                    state = LINK_TAG;
                } else if (++matched == keyword.length()) {
                    target = link_links;
                    state = NAME_END;
                }
                break;
            case NAME_END:
                if (ch == '=') {
                    state = VALUE_START;
                } else if (!blank) {
                    state = GROUND;
                }
                break;
            case VALUE_START:
                if (ch == '\'' || ch == '"') {
                    quote = ch;
                    state = QUOTED;
                } else if (!blank) {
                    value.append(ch);
                    state = UNQUOTED;
                }
                break;
            case UNQUOTED:
                if (blank) {
                    state = VALUE_FINAL;
                } else {
                    value.append(ch);
                }
                break;
            case QUOTED:
                // Only the quote that opened the value closes it, so that
                // href="it's.html" is read in full.
                if (ch == quote) {
                    state = VALUE_FINAL;
                } else {
                    value.append(ch);
                }
                break;
            default:
                break;
            }
        }

        // An unquoted value that runs to the end of the tag (<a href=x>) is complete too.
        if (state == VALUE_FINAL || state == UNQUOTED) {
            recordLink(target, value.toString());
        }
        return tag;
    }

    // Resolves an attribute value against the base url and adds the result to
    // the given list. "&amp;" is decoded first; javascript: hrefs are ignored.
    // A value that cannot be turned into a URL is reported on stderr and dropped.
    private void recordLink(Vector target, String raw_value) {
        String url_str = raw_value.replace("&amp;", "&");
        if (target == href_links && url_str.startsWith("javascript:")) {
            return;
        }
        try {
            target.addElement(new URL(url, url_str));
        } catch (MalformedURLException e) {
            System.err.println("**** URL: " + url);
            System.err.println("**** url_str: " + url_str);
            e.printStackTrace();
        }
    }

    // True when the given url (which may be null) is guessed to be an html page.
    private static boolean looksLikeHTML(URL candidate) {
        return candidate != null
            && guessContentType(candidate.toString()).startsWith("text/html");
    }

    // Guesses a content type from the text of a url. The comparison ignores
    // case, any #fragment and any Greenstone3 ";jsessionid=" suffix.
    // Directories, .html/.htm files, urls with a query string and urls whose
    // last path segment has no '.' are taken to be "text/html"; everything
    // else is reported as "image/jpeg".
    static private String guessContentType(String text) {
        String lower = text.toLowerCase(Locale.ROOT);

        int fragment_index = lower.indexOf('#');
        if (fragment_index >= 0) {
            lower = lower.substring(0, fragment_index);
        }
        int jsessionID_index = lower.indexOf(";jsessionid=");
        if (jsessionID_index >= 0) {
            lower = lower.substring(0, jsessionID_index);
        }

        if (lower.endsWith("/")
            || lower.endsWith(".html")
            || lower.endsWith(".htm")
            || lower.indexOf('?') > 0) {
            return "text/html";
        }

        String last_segment = lower.substring(lower.lastIndexOf('/') + 1);
        if (last_segment.indexOf('.') == -1) { // if no filetype specified, assume HTML?
            return "text/html";
        }
        return "image/jpeg";
    }

}
