package ISSearch;

import java.io.*;
import java.net.*;
import java.util.*;
import java.lang.*;
import java.sql.*;
import oracle.jdbc.*;
import oracle.sql.*;

/**
 * Entry of the crawler's URL queue: a target URL together with the
 * crawl level that was current when the entry was created.
 */
class QueueElement
{
    /** Crawl level recorded for this entry. */
    int crawl_level;
    /** The URL to be visited. */
    URL url;
}

/**
 * The Crawler class of the Web search engine.
 * This class is used to start and stop the Crawler,
 * to reset the engine and to control crawling parameters.
 * @see Runnable
 * @see Thread
 * @see InetAddress
 * @see URL
 * @see HttpURLConnection
 * @see InputStreamReader
 * @see BufferedReader
 * @see Exception
 */
public class ISCrawler implements ISCrawlerInterface
{
    // GUI instance through which the crawler reports progress and status messages
    static MainGUI m = new MainGUI();
    
   //parts of the code taken from http://java.sun.com/developer/technicalArticles/ThirdParty/WebCrawler/WebCrawler.java
   // directive that robotSafe() searches for in a robots.txt file
   static final String DISALLOW = "Disallow:";
   // initializing MAX_CRAWL_DEPTH with 3; can be changed by using setCrawlingDepth(int depth)
   static int MAX_CRAWL_DEPTH = 3;
   // initializing MAX_QUEUE_SIZE with 500; can be changed by using setQueueMaxSize(int m)
   static int MAX_QUEUE_SIZE = 500;
   // initializing TIMEOUT with 2000; can be changed by using setTimeout(int t)
   static int TIMEOUT = 2000; //timeout in ms
   // last document passed to setCurrentDocument(); null until then
   static ISDocumentInterface CURRENT_DOCUMENT = null;
   // crawl level the crawler is currently working at; advanced by getNextURL()
   int crawl_level;
   // NOTE(review): written by stop()/reset() and read inside run(), which start()
   // executes on a separate thread -- confirm whether this needs to be volatile
   static int crawler_state = STOPPED;

   // database handle; run() replaces it with the handle owned by its ISDBCrawler
   ISDB  isdb = new ISDB();
   
   // queue of QueueElement entries for the URLs which should be searched
   Vector vectorToSearch = new Vector();
   // URLs already searched
   Vector vectorSearched = new Vector();
   
   /**
    * Creates a new instance of ISCrawler.
    * The constructor body is empty; the instance fields keep the values
    * given by their declarations.
    */
   public ISCrawler()
   {
   }
   
   /**
    * Adds a new link to the URL queue, if the link is not yet visited
    * and not already waiting in the queue.
    * The entry is tagged with the current crawl level. Nothing is added when
    * <code>link</code> is <code>null</code> or when the next crawl level would
    * exceed the maximum crawling depth.
    * @param link The URL link representation of the new target
    */
   public void addLink(java.net.URL link)
   {
       // a null entry would only fail later, when the queue hands it out
       if (link == null) return;
       // this single check also guarantees crawl_level <= MAX_CRAWL_DEPTH
       if (crawl_level + 1 > MAX_CRAWL_DEPTH) return;
       // the visited list is only consulted here, so a link that is queued
       // twice would also be crawled twice
       if (isVisited(link) || isQueued(link)) return;

       QueueElement qelem = new QueueElement();
       qelem.url = link;
       qelem.crawl_level = crawl_level;
       vectorToSearch.addElement(qelem);
   }

   /**
    * Checks whether the given link is already waiting in the URL queue.
    * @param link The URL to look for; must not be <code>null</code>
    * @return <code>true</code> if an entry with an equal URL is queued
    */
   private boolean isQueued(java.net.URL link)
   {
       // Vector's own methods lock on the vector itself; holding that lock keeps
       // the index scan consistent if another thread removes entries meanwhile
       synchronized (vectorToSearch) {
           for (int i = 0; i < vectorToSearch.size(); i++) {
               QueueElement pending = (QueueElement) vectorToSearch.elementAt(i);
               if (link.equals(pending.url)) return true;
           }
       }
       return false;
   }
   
   /** Returns the best candidate to be visited next.
    *  The result must have the highest priority
    * (in the sense of the selected ordering strategy) under
    * all available links. The selection itself is delegated to
    * <code>getNextURL()</code>, which may also return <code>null</code>.
    * @return The best target to be visited by the Crawler next, <code>null</code> if the queue is empty.
    */
   public java.net.URL getBest()
   {
      // honour the documented contract instead of failing on an empty queue
      if (isDataStructureEmpty()) return null;
      return getNextURL();
   }
   
   /**
    * Returns the current maximum allowed crawling depth.
    * @return The current allowed crawling depth.
    */
   public int getCrawlingDepth()
   {
      return ISCrawler.MAX_CRAWL_DEPTH;
   }
   
   /**
    * Returns the document most recently stored with <code>setCurrentDocument</code>.
    * @return The last visited document as an object that implements ISDocumentInterface;
    * <code>null</code> if no document has been set yet.
    */
   public ISDocumentInterface getCurrentDocument()
   {
      return ISCrawler.CURRENT_DOCUMENT;
   }
   
   /**
    * Sets the last document visited by the Crawler.
    * @param isd The last visited document as an object that implements ISDocumentInterface.
    */
   public void setCurrentDocument(ISDocumentInterface isd)
   {
       ISCrawler.CURRENT_DOCUMENT = isd;
   }
   
   /**
    * Returns the last URL visited by the Crawler, i.e. the most recent
    * entry of the list of searched URLs.
    * @return The last visited URL; <code>null</code> if no links were crawled yet.
    */
   public java.net.URL getCurrentURL()
   {
      // lastElement() throws NoSuchElementException on an empty vector,
      // whereas the contract of this method promises null
      if (vectorSearched.isEmpty()) return null;
      return (URL)vectorSearched.lastElement();
   }
   
   /**
    * Returns the maximum allowed size of the URL queue.
    * @return The maximum allowed queue size
    */
   public int getMaxQueueSize()
   {
      return ISCrawler.MAX_QUEUE_SIZE;
   }
   
   /**
    * Returns the number of entries currently waiting in the URL queue.
    * @return The current size of the URL queue.
    */
   public int getQueueSize()
   {
       final int pending = this.vectorToSearch.size();
       return pending;
   }
   
   /**
    * Returns the current state of the crawler.
    * Possible states are <code>RUNNING</code> and <code>STOPPED</code>.
    * @return The current state of the crawler, <code>RUNNING</code> or <code>STOPPED</code>
    */
   public int getState()
   {
      return crawler_state;
   }
   
   /**
    * Sets the current state of the crawler.
    * Possible states are <code>RUNNING</code> and <code>STOPPED</code>.
    * @param state_code The new state of the crawler, <code>RUNNING</code> or <code>STOPPED</code>
    */
   public void setState(int state_code)
   {
       crawler_state = state_code;
   }
   
   /**
    * Returns the timeout the crawler currently works with.
    * @return The current timeout of the crawler in ms.
    */
   public int getTimeout()
   {
      return ISCrawler.TIMEOUT;
   }
   
   /**
    * Sets the timeout the crawler works with.
    * @param t The new timeout of the crawler in ms.
    */
   public void setTimeout(int t)
   {
       ISCrawler.TIMEOUT = t;
   }
   
   /**
    * Checks if the given URL is already contained in the list of searched URLs.
    * @param doc The URL to look up.
    * @return <code>true</code> if the engine was able to recognize
    * the given URL as already visited, <code>false</code> otherwise.
    */
   public boolean isVisited(java.net.URL doc)
   {
      // the vector performs the same element-by-element equals() scan
      return vectorSearched.contains(doc);
   }
   
   /**
    * Sets the maximum allowed crawling depth.
    * @param depth The maximum allowed crawling depth.
    */
   public void setCrawlingDepth(int depth)
   {
      ISCrawler.MAX_CRAWL_DEPTH = depth;
   }
   
   /**
    * Sets the maximum allowed size of the URL queue.
    * @param m The maximum allowed queue size
    */
   public void setQueueMaxSize(int m)
   {
       ISCrawler.MAX_QUEUE_SIZE = m;
   }
   
   /**
    * Starts the crawler: creates a new thread that executes this crawler's
    * <code>run()</code> method, switches the engine state to <code>RUNNING</code>
    * and then launches the thread.
    */
   public void start()
   {
       final Thread worker = new Thread(this);

       System.out.println("Start");
       this.setState(RUNNING);
       worker.start();
   }
   
   /**
    * Stops the crawler.
    * This method sets the engine status to <code>STOPPED</code>; the crawl loop
    * in <code>run()</code> checks the state and ends when it is no longer
    * <code>RUNNING</code>.
    */
   public void stop()
   {
       System.out.println("Stop");
       setState(STOPPED);
   }
   
   /**
    * Resets the crawler.
    * This method stops the crawling by setting the crawler status to
    * <code>STOPPED</code>, then resets the URL queue, the list of visited
    * links and the current crawl level.
    */
   public void reset()
   {
       // signal the crawl loop first so it stops before the queue is emptied
       setState(STOPPED);
       vectorSearched.removeAllElements();
       vectorToSearch.removeAllElements();
       // without this, links added after a reset would be tagged with the depth
       // reached by the previous crawl and could be rejected by the depth limit
       crawl_level = 0;
       System.out.println("Reset");
   }
   
   /**
    * Crawl loop. This method is prescribed by the <code>Runnable</code> interface;
    * the call <code>new Thread(..).start()</code> starts this method in a new thread.
    * <p>
    * The method opens the database, then takes URLs from the queue while the
    * queue is not empty, the state is <code>RUNNING</code> and the depth limit
    * is not exceeded. Only <code>http</code> URLs permitted by
    * <code>robotSafe</code> are fetched, and only <code>text/html</code> and
    * <code>text/plain</code> documents are parsed. The links of a parsed document
    * are queued for the next crawl level and the document is stored in the
    * database. After the loop, features are created if the GUI requests it,
    * the GUI is updated and the database is closed.
    * <p>
    * Each document is fetched through one connection that carries the
    * configured connect timeout, and its stream is always closed again.
    * When the method ends the state is <code>STOPPED</code>.
    */
   public void run()
   {
       ISDBCrawler dbcrawler = new ISDBCrawler();
       boolean exception = false;
       
       m.addInformation("Opening connection to database...");
       
       boolean dbopen_success = dbcrawler.openDB();
       
       isdb = dbcrawler.isdb;
             
       if (getState() == RUNNING) {
           m.addInformation("(started)");
       }
       
       if (!dbopen_success)
       {
           System.err.println("openDB failed!");
           m.addInformation("openDB failed!");
       } else {
           m.addInformation("Connection established.");
       }
             
       while ((!isDataStructureEmpty()) && (getState() == RUNNING) && (crawl_level <= MAX_CRAWL_DEPTH) && dbopen_success) {
           exception = false;
           // getting next URL for the crawler
           URL url = null;
           if (getState() == RUNNING) url = getNextURL();
           if (url == null) {
               // the queue hands out nothing (stopped meanwhile, or depth limit
               // reached); repeating the loop could not make progress
               break;
           }
           // cheap protocol test first: robotSafe() has to contact the server
           if (!(("http").equals(url.getProtocol())) || !robotSafe(url)) {
               System.err.println("unsupported site");
               continue;
           }

           InputStream urlstream = null;
           InputStreamReader urlstreamreader = null;
           try {
               URLConnection urlConnection = url.openConnection();
               urlConnection.setConnectTimeout(TIMEOUT);
               // read the document through the connection configured above, so the
               // timeout applies and the server is contacted only once; buffering
               // gives guessContentTypeFromStream() the mark/reset support it needs
               urlstream = new java.io.BufferedInputStream(urlConnection.getInputStream());
               m.addInformation("CONNECTION OPENED: " + url.toString());

               String type = getContentType(urlConnection);
               if (type == null) {
                   type = URLConnection.guessContentTypeFromStream(urlstream);
                   if (type == null) {
                       type = "text/html";
                   }
               }
               m.addInformation("CONTENT GOT: " + type);

               //we only have a look at documents which are text/html or text/plain
               boolean textual = ("text/html").equals(type) || ("text/plain").equals(type);
               if (textual && getState() == RUNNING)
               {
                   //preparing the stream for the parser by converting it into an InputStreamReader
                   urlstreamreader = new InputStreamReader(urlstream);

                   m.addInformation("PARSING DOC: " + url.toString());
                   ISDocumentInterface isd = runParser(urlstreamreader);
                   isd.setLink(url.toString());
                   setCurrentDocument(isd);
                   m.addInformation("PARSING FINISHED");

                   if (crawl_level < MAX_CRAWL_DEPTH) {
                       URL[] links = isd.getUrls();
                       for (int i = 0; i < links.length; i++) {
                           // addLink() tags entries with crawl_level; the links of this
                           // document belong to the next level
                           crawl_level++;
                           if (getQueueSize() < getMaxQueueSize()) {
                               addLink(links[i]);
                               m.addInformation("ADDING URL " + links[i].toString() + " TO QUEUE...");
                               m.updateProgressBar(i+1, links.length);
                           }
                           crawl_level--;
                       }
                   }

                   m.updateProgressBar(0,0);
                   dbcrawler.store(url, isd);
               }
           } catch (IOException e) {
               exception = true;
               System.err.println("crawling " + url + " failed: " + e);
           } catch (NullPointerException e) {
               exception = true;
               System.err.println("crawling " + url + " failed: " + e);
           } finally {
               // release the connection whether or not the document was parsed
               closeQuietly(urlstreamreader);
               closeQuietly(urlstream);
           }
       }
       
       //creating features (pointless without a database connection)
       if (dbopen_success && (!(exception)) && m.getAutomaticCreation()) {
           isdb.createFeatures();
       }
       
       if(isDataStructureEmpty()) {
           m.setInformationLabelText("(done)");
           m.setCrawlerStateLabelText("---");
           m.enableItems(true);
           m.enableItems(true);
           setState(STOPPED);
       } else if (getState() == RUNNING) {
           // the loop ended although URLs are left (database unavailable or depth
           // limit reached); this thread terminates, so the crawler is not running
           setState(STOPPED);
       }
       if (getState() == STOPPED) {
           m.addInformation("(stopped)");
           m.enableItems(true);
           m.enableItems(true);
       }
       
       m.addInformation("Closing connection to database...");
       dbcrawler.closeDB();
   }

   /**
    * Closes the given resource and ignores a failure to do so.
    * @param resource The resource to close; may be <code>null</code>
    */
   private static void closeQuietly(java.io.Closeable resource)
   {
       if (resource == null) return;
       try {
           resource.close();
       } catch (IOException ignored) {
           // nothing useful can be done if closing fails
       }
   }

   
   /** Returns the next URL to be searched. It is doing the job for <code>getBest()</code>.
    * The head of the queue is removed, appended to the list of searched URLs
    * and returned. If the head belongs to a deeper crawl level than the current
    * one, the current crawl level is advanced first.
    * @return next URL in the queue; <code>null</code> if the queue is empty or
    * the current crawl level has reached the maximum crawling depth.
    */
   public java.net.URL getNextURL()
   {
       // elementAt(0) would throw on an empty queue
       if (vectorToSearch.isEmpty()) return null;
       QueueElement qelem = (QueueElement)vectorToSearch.elementAt(0);

       // advance level by level until the head's level is reached, giving up
       // as soon as the depth limit is hit
       while (true) {
           if (crawl_level >= MAX_CRAWL_DEPTH) return null;
           if (qelem.crawl_level <= crawl_level) break;
           crawl_level++;
       }

       m.addInformation("GETTING NEXT URL...");
       URL temp = qelem.url;
       vectorSearched.addElement(temp);
       vectorToSearch.removeElementAt(0);
       m.addInformation("NEXT URL GOT: " + temp.toString());
       return temp;
   }

   
   /**
    * Tells whether the queue of URLs to be searched is empty.
    * @return <code>true</code> if no URL is waiting in the queue, <code>false</code> otherwise
    */
   public boolean isDataStructureEmpty()
   {
       return vectorToSearch.isEmpty();
   }
  
   /** Starts the parser.
    * A new <code>ISParser</code> is created for every call.
    * @param r Reader which delivers the content to be parsed.
    * @return The result of the parser as <code>ISDocumentInterface</code>.
    */
   public ISDocumentInterface runParser(Reader r)
   {
       ISParser p = new ISParser();
       // no placeholder document is needed: the parser delivers the result
       return p.parse(r);
   }

   
   /** Checks whether the robots.txt of the URL's server permits visiting the URL.
    * The file is requested via http from the host and port of the given URL.
    * Every <code>Disallow:</code> entry of the file is assumed to refer to this
    * crawler; the URL is rejected if its file part starts with one of the
    * disallowed paths. A <code>Disallow:</code> entry without a value forbids nothing.
    * @param url URL which should be checked.
    * @return <code>true</code> if the document can be parsed: robots.txt does not
    * forbid it or cannot be read; <code>false</code> if a rule forbids it
    * or the robots.txt request cannot be built.
    */
   public boolean robotSafe(java.net.URL url) {
       String strHost = url.getHost();
       System.out.println(strHost);
       // form URL of the robots.txt file; the port is kept (-1 means default)
       // so that the file is requested from the server that hosts the document
       URL urlRobot;
       try {
           urlRobot = new URL("http", strHost, url.getPort(), "/robots.txt");
       } catch (MalformedURLException e) {
           // something weird is happening, so don't trust it
           return false;
       }

       String strCommands;
       try {
           URLConnection robotConnection = urlRobot.openConnection();
           robotConnection.setConnectTimeout(TIMEOUT);
           InputStream urlRobotStream = robotConnection.getInputStream();
           try {
               // read in entire file; an empty file simply yields an empty string
               java.io.ByteArrayOutputStream content = new java.io.ByteArrayOutputStream();
               byte[] b = new byte[1000];
               int numRead;
               while ((numRead = urlRobotStream.read(b)) != -1) {
                   content.write(b, 0, numRead);
               }
               // decode once, so multi-byte characters are not split between chunks
               strCommands = content.toString("UTF-8");
           } finally {
               urlRobotStream.close();
           }
       } catch (IOException e) {
           // if there is no robots.txt file, it is OK to search
           return true;
       } catch (IllegalArgumentException e) {
           // don't trust it...
           return false;
       }

       // assume that this robots.txt refers to us and
       // search for "Disallow:" commands.
       String strURL = url.getFile();
       if (strURL.length() == 0) {
           // a URL without a path addresses the root document
           strURL = "/";
       }
       int index = 0;
       while ((index = strCommands.indexOf(DISALLOW, index)) != -1) {
           index += DISALLOW.length();

           // the value of a rule ends with its line; looking further would
           // mistake the start of the next line for a path
           int lineEnd = index;
           while (lineEnd < strCommands.length()
                   && strCommands.charAt(lineEnd) != '\n'
                   && strCommands.charAt(lineEnd) != '\r') {
               lineEnd++;
           }
           StringTokenizer st = new StringTokenizer(strCommands.substring(index, lineEnd));

           // an empty rule (or one followed only by a comment) forbids nothing
           if (!st.hasMoreTokens())
               continue;
           String strBadPath = st.nextToken();
           if (strBadPath.startsWith("#"))
               continue;

           // if the URL starts with a disallowed path, it is not safe
           if (strURL.startsWith(strBadPath))
               return false;
       }

       return true;
   }

   
   /**
    * Returns the ContentType of the current document.
    * The value of the content-type header is cut at the first of the
    * characters <code>" \t\n\r\f,.;:?!"</code>, which removes parameters such
    * as the charset.
    * @param urlConnection The current document given by a URLConnection.
    * @return The ContentType of the current document as String;
    * <code>null</code> if the connection reports no content type or the
    * reported value contains no usable token.
    */
   public String getContentType(URLConnection urlConnection) {
       String contentType = urlConnection.getContentType();
       // the header is optional; let the caller fall back to guessing
       if (contentType == null) {
           return null;
       }
       StringTokenizer s = new StringTokenizer(contentType, " \t\n\r\f,.;:?!", false);
       if (!s.hasMoreTokens()) {
           return null;
       }
       return s.nextToken();
   }

     
    /**
     * Command line entry point. The body currently performs no work: the
     * sample crawl below (seed URL, crawling depth 2) is commented out.
     * @param args command line arguments; not used
     */
    public static void main(String[] args)
    {
/*        try {
            URL url = new URL("http://www.mpi-sb.mpg.de/units/ag5/teaching/ss05/is05/links.htm");
            ISCrawler c = new ISCrawler();
            c.addLink(url);
            c.setCrawlingDepth(2);
            c.start();
        } catch (MalformedURLException murle) {
            //nothing to do
        }*/
    }
}