/*
 *
 * BEGIN HEADER
 *
 * ---- 
 *
 * $ID: PantherProxy.java,v 1.6 2001/06/12 20:58:43 burton Exp $
 * $Project: http://panther.openprivacy.org $
 * $CVSROOT: :pserver:anoncvs@sierra.openprivacy.org:/usr/local/cvs/public $
 * $WebCVS: http://www.openprivacy.org/cgi-bin/cvsweb/cvsweb.cgi/panther/ $
 * $Mailing-List: http://www.openprivacy.org/lists/ $
 * $Bugzilla: http://bugzilla.openprivacy.org/ $
 * Copyright 2001 OpenPrivacy.org.  All rights reserved.
 *
 * ---- 
 *
 * Copyright 2001 OpenPrivacy.org.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the LICENSE which you should have received with this package. 
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE.
 *
 * END HEADER
 * 
 */

/*

TODO:

This is broken.

    http://www.smokedot.org/?op=displaystory;sid=2002/9/25/203647/918

    I have NO idea what is wrong with this URL syntax.  Is it not supported by java.net.URL?

    The images are cached but the URL isn't fetched.  Maybe because I am not
    following a redirect?
    
*/

package org.openprivacy.reptile;

import java.io.*;
import java.util.*;

import org.apache.regexp.*;

import talon.util.*;

/**
 * Responsible for accepting an input string, finding where it isn't well formed
 * and correcting the problem so that we can return a valid well formed XML
 * fragment.
 *
 * <p>The repair is purely lexical: markup is located with a regular expression
 * and a stack of currently open element names is maintained while the input is
 * copied to the output.  The following corrections are applied:
 *
 * <ul>
 * <li>elements that are never closed are closed (innermost first);</li>
 * <li>an end tag that does not match the innermost open element closes every
 *     element opened after the one it names;</li>
 * <li>end tags that match no open element are dropped;</li>
 * <li>HTML elements that have no content (br, img, hr, ...) are rewritten in
 *     self terminating form;</li>
 * <li>comments, declarations, processing instructions and tags that are
 *     already self terminating are copied through unchanged.</li>
 * </ul>
 *
 * <p>Known limitation: a tag is considered to end at the first '&gt;', so
 * attribute values containing '&gt;' are not handled.
 * 
 * @author <a href="mailto:burton@relativity.yi.org">burtonator</a>
 * @version $Id: WellFormedContentParser.java,v 1.3 2002/10/16 01:16:57 burton Exp $
 */
public class WellFormedContentParser {

    /**
     * Matches either the start of a comment or a start/end tag.  Paren 1 holds
     * the element name.  The name stops at any whitespace (not only at a
     * space) so that a tag whose attributes begin on a new line or after a tab
     * still yields a clean name.  The trailing '&gt;' is optional so that a
     * tag truncated by the end of the input is still seen.
     */
    private static final String MARKUP_PATTERN =
        "<!--" + "|" + "</?([^/> \t\r\n]+)([^>])*([ ]?/?>)?";

    /**
     * Matches the end of a comment.
     */
    private static final String COMMENT_END_PATTERN = "--[ ]?/?>";

    /**
     * Lower case names of the elements declared EMPTY by HTML 4.01.  These can
     * never have content so they are always written in self terminating form.
     */
    private static final String[] EMPTY_ELEMENTS = {
        "area", "base", "basefont", "br", "col", "frame", "hr",
        "img", "input", "isindex", "link", "meta", "param"
    };

    /**
     * Accept broken HTML and output valid XML.
     *
     * <p>New <code>RE</code> instances are created on every call; no state is
     * shared between calls.
     *
     * @param input the markup to repair.
     * @return the input with unbalanced and unterminated elements corrected.
     * @throws Exception if the regular expressions can not be compiled.
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public static String parse( String input ) throws Exception {

        StringBuffer output = new StringBuffer( input.length() + 16 );

        RE markup = new RE( MARKUP_PATTERN );
        RE comment_end = new RE( COMMENT_END_PATTERN );

        //names of the elements that are currently open, exactly as they were
        //written in their start tag, innermost on top.
        Stack open_elements = new Stack();

        int index = 0;

        //every match is at least two characters long so index always advances.
        while ( index < input.length() && markup.match( input, index ) ) {

            String match = markup.getParen( 0 );
            int start = markup.getParenStart( 0 );
            int end = markup.getParenEnd( 0 );

            //character data in front of the markup is always kept, and always
            //kept in front of whatever we emit for the markup itself.
            output.append( input.substring( index, start ) );

            if ( isCommentBegin( match ) ) {

                //copy the comment verbatim so that markup inside of it is not
                //interpreted.  An unterminated comment swallows the rest of
                //the input.
                if ( comment_end.match( input, end ) ) {
                    index = comment_end.getParenEnd( 0 );
                } else {
                    index = input.length();
                }

                output.append( input.substring( start, index ) );
                continue;

            }

            String name = markup.getParen( 1 );

            if ( isElementEnd( match ) ) {

                closeElement( name, open_elements, output );

            } else if ( isDeclaration( name ) ) {

                //<!DOCTYPE ...>, <?xml ...?> and friends are not elements and
                //must never end up on the stack.
                output.append( match );

            } else if ( isSelfTerminating( match ) ) {

                //already well formed, nothing to track.
                output.append( match );

            } else if ( requiresSelfTermination( name ) ) {

                output.append( stripTagClose( match ) );
                output.append( "/>" );

            } else {

                open_elements.push( name );

                //re-adding the '>' also terminates a tag that was cut off by
                //the end of the input.
                output.append( stripTagClose( match ) );
                output.append( ">" );

            }

            index = end;

        }

        //add the remaining unmatched data
        output.append( input.substring( index ) );

        //close whatever is still open, innermost element first.
        while ( open_elements.empty() == false ) {

            appendEndTag( (String)open_elements.pop(), output );

        }

        return output.toString();

    }

    /**
     * Handle an end tag for the given element name.
     *
     * <p>The nearest open element with that name (ignoring case) is looked up.
     * If there is one, it and every element opened after it are closed, using
     * the names exactly as they appeared in the start tags so that start and
     * end tags agree.  If there is none, or if the name is one of the elements
     * that is always self terminated, the end tag is dropped because emitting
     * it would leave the output unbalanced.
     *
     * @param name the element name taken from the end tag.
     * @param open_elements the stack of open element names; modified.
     * @param output the buffer that receives the generated end tags.
     */
    private static void closeElement( String name,
                                      Stack open_elements,
                                      StringBuffer output ) {

        if ( requiresSelfTermination( name ) ) {
            return;
        }

        int position = -1;

        for ( int i = open_elements.size() - 1; i >= 0; --i ) {

            if ( name.equalsIgnoreCase( (String)open_elements.elementAt( i ) ) ) {
                position = i;
                break;
            }

        }

        if ( position < 0 ) {
            return;
        }

        while ( open_elements.size() > position ) {

            appendEndTag( (String)open_elements.pop(), output );

        }

    }

    /**
     * Append an end tag for the given element name to the output.
     */
    private static void appendEndTag( String name, StringBuffer output ) {

        output.append( "</" ).append( name ).append( ">" );

    }

    /**
     * Return the match without its trailing '&gt;'.  The match is returned
     * unchanged when it has none, which happens when the input ends in the
     * middle of a tag.
     */
    private static String stripTagClose( String match ) {

        if ( match.endsWith( ">" ) ) {
            return match.substring( 0, match.length() - 1 );
        }

        return match;

    }

    /**
     * Return true if this match opens a new element, that is, if it is neither
     * an end tag nor an already self terminating tag.
     *
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    private static boolean isNewElement( String match ) {

        return isElementEnd( match ) == false && isSelfTerminating( match ) == false;

    }
    
    /**
     * Return true if this match is the end of an element body.
     *
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    private static boolean isElementEnd( String match ) {

        return match.startsWith( "</" );

    }

    /**
     * Return true if this match is self terminating <img/>
     *
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    private static boolean isSelfTerminating( String match ) {

        return match.endsWith( "/>" );

    }

    /**
     * Return true if the name belongs to a declaration or processing
     * instruction (it starts with '!' or '?') rather than to an element.
     */
    private static boolean isDeclaration( String name ) {

        return name.startsWith( "!" ) || name.startsWith( "?" );

    }

    /**
     * Return true if this is an element that should be self terminating like <br/>
     *
     * <p>The comparison ignores case.  The name is lower cased with a fixed
     * locale so that the result does not depend on the default locale of the
     * JVM.
     *
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    private static boolean requiresSelfTermination( String name ) {

        String lower = name.toLowerCase( Locale.ENGLISH );

        for ( int i = 0; i < EMPTY_ELEMENTS.length; ++i ) {

            if ( EMPTY_ELEMENTS[ i ].equals( lower ) ) {
                return true;
            }

        }

        return false;

    }

    /**
     * Return true if this match is the beginning of a comment.
     *
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    private static boolean isCommentBegin( String match ) {

        return match.equals( "<!--" );
        
    }

    /**
     * Read the file named by the first argument, repair it and print the
     * result on standard out.  Failures are reported as a stack trace.
     */
    public static void main( String[] args ) {

        if ( args.length < 1 ) {

            System.err.println( "Usage: WellFormedContentParser <file>" );
            return;

        }

        try { 

            FileInputStream in = new FileInputStream( args[ 0 ] );

            try {

                System.out.println( parse( InputStreamUtils.toString( in ) ) );

            } finally {

                //NOTE(review): InputStreamUtils.toString may already close
                //the stream; closing it a second time is harmless.
                in.close();

            }
            
        } catch ( Throwable t ) {
            
            t.printStackTrace();
            
        }
        
    }

}
