package ISSearch;

import java.io.*;
import javax.swing.text.html.*;

import javax.swing.text.*;
import javax.swing.text.html.*;
import javax.swing.text.html.parser.*;
import javax.swing.text.html.HTMLEditorKit.*;
import java.util.*;
import java.net.*;

/**
 * This <B>mandatory</B> class implements all functions prescribed by <code>ISParserInterface</code>:
 * stopword detection, HTML parsing (word and link extraction) and word stemming.
 *
 * <p>The stopword list is read from the file <code>STOPWORDS</code> in the current working
 * directory the first time it is needed and is then kept in memory for the lifetime of this
 * instance.
 */
public class ISParser implements ISParserInterface
{
    /** Name of the stopword file; a relative path, so it is resolved against the working directory. */
    private static final String STOPWORD_FILE = "STOPWORDS";

    /** Characters that separate words in the text of a document. */
    private static final String WORD_DELIMITERS = " \t\n\r\f,.;:?!";

    /** Lower-cased stopwords, or <code>null</code> as long as the file has not been loaded successfully. */
    private Set<String> stopwords;

    /** Creates a new instance of ISParser */
    public ISParser()
    {
    }

    /** Decides whether the given token is claimed as stopword or not, based on the entries of
     * the <code>STOPWORDS</code> file. The check is case-insensitive (e.g., both tokens 'the'
     * and 'ThE' are recognized as stopwords if 'the' is listed).
     * @param who The String to be checked.
     * @return true if the given string is a stopword; false otherwise, including when
     *         <code>who</code> is null or the stopword file cannot be read.
     */
    public boolean isStopword(String who)
    {
        if (who == null) {
            return false;
        }
        Set<String> known = getStopwords();
        if (known == null) {
            // The list could not be loaded; the error has already been reported.
            return false;
        }
        // The list is stored lower-cased, so the candidate must be lower-cased as well.
        return known.contains(who.toLowerCase(Locale.ENGLISH));
    }

    /** Returns the stopword set, loading it from the stopword file on first use.
     * A failed load is reported and not cached, so the next call tries again.
     * Synchronized so that concurrent callers never observe a half-initialized set.
     * @return the lower-cased stopwords, or null if the file could not be read.
     */
    private synchronized Set<String> getStopwords()
    {
        if (stopwords != null) {
            return stopwords;
        }
        Set<String> loaded = new HashSet<String>();
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(STOPWORD_FILE));
            String line;
            while ((line = reader.readLine()) != null) {
                // Trim so that stray whitespace in the file cannot prevent a match.
                String word = line.trim().toLowerCase(Locale.ENGLISH);
                if (word.length() > 0) {
                    loaded.add(word);
                }
            }
        } catch (IOException e) {
            System.out.println("Fehler beim ffnen der Datei!");
            System.err.println(e.toString());
            return null;
        } finally {
            // Close the file on every path, including a failed read.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a read-only file fails.
                }
            }
        }
        stopwords = loaded;
        return loaded;
    }

    /** Performs the input analysis. Returns the container object that implements the
     * <I>ISDocumentInterface</I> and contains the extracted words and links.
     * Words are the tokens of the document text that are not stopwords; their position is
     * their 1-based index among the accepted words. Links are the <code>href</code> values of
     * anchor tags that form a valid absolute URL; all other anchors are skipped.
     * @param input the input of the parser (e.g., text file or HTTP connection), represented by the <code>Reader</code>
     * @return Container object with terms and links or <I>null</I> if <code>input</code> is
     *         null or reading from it fails.
     */
    public ISDocumentInterface parse(Reader input)
    {
        if (input == null) {
            return null;
        }

        final List<URL> links = new ArrayList<URL>();
        final List<ISTermInterface> terms = new ArrayList<ISTermInterface>();

        HTMLEditorKit.ParserCallback callback = new HTMLEditorKit.ParserCallback() {
            // Collects the non-stopword tokens of each text fragment.
            @Override
            public void handleText(char[] data, int pos) {
                StringTokenizer tokens = new StringTokenizer(new String(data), WORD_DELIMITERS, false);
                while (tokens.hasMoreTokens()) {
                    String token = tokens.nextToken();
                    // Use the enclosing parser so its cached stopword list is shared.
                    if (!ISParser.this.isStopword(token)) {
                        ISTermInterface term = new ISTerm();
                        term.setWord(token);
                        term.setPos(terms.size() + 1);
                        terms.add(term);
                    }
                }
            }

            // Collects the link targets of anchor tags.
            @Override
            public void handleStartTag(HTML.Tag tag, MutableAttributeSet attrSet, int pos) {
                if (tag != HTML.Tag.A) {
                    return;
                }
                Object href = attrSet.getAttribute(HTML.Attribute.HREF);
                if (!(href instanceof String)) {
                    // Anchor without a usable href (e.g. a named anchor).
                    return;
                }
                try {
                    links.add(new URL((String) href));
                } catch (MalformedURLException ignored) {
                    // Not an absolute, well-formed URL (e.g. a relative link); skip it.
                }
            }
        };

        try {
            new ParserDelegator().parse(input, callback, false);
        } catch (IOException e) {
            return null;
        }

        URL[] urlArray = links.toArray(new URL[links.size()]);
        ISTermInterface[] termArray = terms.toArray(new ISTermInterface[terms.size()]);

        ISDocumentInterface document = new ISDocument();
        document.setUrls(urlArray);
        document.setTerms(termArray);
        // Only a Reader is available here, so the document's own link cannot be determined yet.
        document.setLink("");
        return document;
    }

    /** Applies the Porter stemming algorithm and returns the resulting word stem.
     * The output is normalized (using String.toLowerCase() and String.trim()).
     * @param who The word to be stemmed.
     * @return word stem, trimmed and lowercase, or null if stemming fails
     *         (e.g. because <code>who</code> is null).
     */
    public String stem(String who)
    {
        try {
            // Normalize before stemming as well: presumably Stemmer follows the reference
            // Porter implementation, which only recognizes lower-case suffixes - TODO confirm.
            String normalized = who.trim().toLowerCase(Locale.ENGLISH);

            Stemmer stemmer = new Stemmer();
            stemmer.add(normalized.toCharArray(), normalized.length());
            stemmer.stem();

            return stemmer.toString().toLowerCase(Locale.ENGLISH).trim();
        } catch (Exception e) {
            System.err.println(e.toString());
            return null;
        }
    }

    /** Entry point for manual testing only; the test code is currently disabled.
     * @param args command line arguments (unused).
     */
    public static void main(String[] args) {
        //for testing purposes only!!!
/*       try {
        ISParser p = new ISParser();
        System.out.println("'car' is stopword: " + p.isStopword("car")); //"car" is not in STOPWORDS
        System.out.println("'actually' is stopword: " + p.isStopword("actually")); //"actually" is in STOPWORDS
        System.out.println("stem of 'actually': " + p.stem("actually")); //stem "actually"
        System.out.println("stem of 'available': " + p.stem("available")); //stem "available"
        System.out.println("current user directory: " + System.getProperty("user.dir")); //getting the current working directory
        System.out.println("---");

        Reader in = new BufferedReader(new FileReader("test.html"));

        ISDocumentInterface isd = p.parse(in);
        URL[] urls = isd.getUrls();

        for (int i=0; i<urls.length; i++) {
            System.out.println(urls[i]);
        }

        System.out.println("---");

        ISTermInterface[] wordlist = isd.getTerms();

        for (int i=0; i<wordlist.length; i++) {
            System.out.println(wordlist[i].getWord());
            System.out.println(wordlist[i].getStem());
            System.out.println(wordlist[i].getPos());
        };

        isd.setLink("http://www.google.de/");
        System.out.println(isd.getLink());

        } catch (FileNotFoundException e) {
            System.err.println(e.toString());
        } catch (IOException e) {
            System.err.println(e.toString());
        }*/
    }
}