/*
 *
 * BEGIN HEADER
 *
 * ---- 
 *
 * $ID: PantherProxy.java,v 1.6 2001/06/12 20:58:43 burton Exp $
 * $Project: http://panther.openprivacy.org $
 * $CVSROOT: :pserver:anoncvs@sierra.openprivacy.org:/usr/local/cvs/public $
 * $WebCVS: http://www.openprivacy.org/cgi-bin/cvsweb/cvsweb.cgi/panther/ $
 * $Mailing-List: http://www.openprivacy.org/lists/ $
 * $Bugzilla: http://bugzilla.openprivacy.org/ $
 * Copyright 2001 OpenPrivacy.org.  All rights reserved.
 *
 * ---- 
 *
 * Copyright 2001 OpenPrivacy.org.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the LICENSE which you should have received with this package. 
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE.
 *
 * END HEADER
 * 
 */

package org.openprivacy.reptile;

import java.io.*;
import java.util.*;

import org.apache.regexp.*;

import talon.util.*;

/*

TODO:

 */

/**
 * Given some input HTML and the resource URL that HTML was fetched from,
 * rewrites the <code>src</code>/<code>href</code> attribute of every
 * <code>&lt;img&gt;</code>, <code>&lt;a&gt;</code>, <code>&lt;script&gt;</code>
 * and <code>&lt;link&gt;</code> element into a full URL (if it is partial) and
 * records each rewritten link as a dependency of the document.
 *
 * <p>Typical usage: call {@link #setResource(String)} with the URL of the
 * document, then {@link #relativize(String)} with its content, then read the
 * collected links with {@link #getDependencies()}.  Dependencies accumulate
 * across calls to <code>relativize</code> on the same instance.
 *
 * @author <a href="mailto:burton@relativity.yi.org">burtonator</a>
 * @version $Id: RelativizeContentParser.java,v 1.5 2002/10/16 01:16:57 burton Exp $
 */
public class RelativizeContentParser {

    /** URL of the document whose links are being expanded. */
    private String resource = null;

    /** Dependencies (RelativizeDependency instances) found so far, without duplicates. */
    private Vector dependencies = new Vector();

    /**
     * Get the value of <code>resource</code>.
     *
     * @return the URL that relative links are resolved against, or
     *         <code>null</code> if it has not been set.
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public String getResource() {

        return this.resource;

    }

    /**
     * Set the value of <code>resource</code>.
     *
     * @param resource the URL of the document that will be passed to
     *        {@link #relativize(String)}.
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public void setResource( String resource ) {

        this.resource = resource;

    }

    /**
     * Get a snapshot of every dependency registered so far.
     *
     * @return a newly allocated array; modifying it does not affect this parser.
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public RelativizeDependency[] getDependencies() {

        RelativizeDependency[] depends = new RelativizeDependency[ dependencies.size() ];
        dependencies.copyInto( depends );

        return depends;

    }

    /**
     * Used to fix relative links in HTML content so that everything is
     * expanded.  Every matched link is passed through {@link #expand(String)}
     * and registered through {@link #addDependency(RelativizeDependency)}.
     *
     * @param content the HTML to rewrite.
     * @return the content with every matched link replaced by its expanded form.
     * @throws Exception if the element pattern cannot be compiled or a link
     *         cannot be expanded.
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public String relativize( String content ) throws Exception {

        StringBuffer buff = new StringBuffer();

        //offset of the first character of content not yet copied into buff
        int index = 0;

        //paren 1 is the element name, paren 3 is the attribute value.
        RE elementr = new RE( "<(img|a|script|link)[^>]*(src|href)=[\"']([^\"']+)[\"'][^>]*/?>",
                             RE.MATCH_CASEINDEPENDENT | RE.MATCH_MULTILINE );

        while ( elementr.match( content, index ) ) {

            //read everything we need from the match before doing other work.
            String localName = elementr.getParen( 1 );
            int linkStart = elementr.getParenStart( 3 );
            int linkEnd = elementr.getParenEnd( 3 );
            int elementEnd = elementr.getParenEnd( 0 );

            String slink = expand( elementr.getParen( 3 ) );

            //copy everything up to the link, then the expanded link.
            buff.append( content.substring( index, linkStart ) );
            buff.append( slink );

            //add this as a dependency.  equalsIgnoreCase is used instead of
            //toLowerCase() because the latter depends on the default locale
            //("IMG" does not lower-case to "img" under a Turkish locale).
            if ( "img".equalsIgnoreCase( localName ) ) {

                ImageRelativizeDependency dep = new ImageRelativizeDependency();
                dep.resource = slink;
                this.addDependency( dep );

            } else if ( "a".equalsIgnoreCase( localName ) ) {

                DocumentRelativizeDependency dep = new DocumentRelativizeDependency();
                dep.resource = slink;
                this.addDependency( dep );

            } else if ( "script".equalsIgnoreCase( localName ) ) {

                ScriptRelativizeDependency dep = new ScriptRelativizeDependency();
                dep.resource = slink;
                this.addDependency( dep );

            } else if ( "link".equalsIgnoreCase( localName ) ) {

                LinkRelativizeDependency dep = new LinkRelativizeDependency();
                dep.resource = slink;
                this.addDependency( dep );

            }

            //copy the remainder of the element (closing quote, other attributes, '>')
            buff.append( content.substring( linkEnd, elementEnd ) );

            index = elementEnd;

        }

        //add the rest of the content
        buff.append( content.substring( index ) );

        return buff.toString();

    }

    /**
     * Expand a link relative to the current resource.  This takes care of
     * links such as
     *
     * <pre>
     * /foo.html      -&gt; http://site.com/foo.html
     * foo.html       -&gt; http://site.com/base/foo.html
     * ../foo.html    -&gt; http://site.com/foo.html
     * //other.com/x  -&gt; (scheme of the resource)://other.com/x
     * #top           -&gt; http://site.com/base/index.html#top
     * ?page=2        -&gt; http://site.com/base/index.html?page=2
     * </pre>
     *
     * Links that already carry a scheme (http:, https:, ftp:, javascript:,
     * mailto:, data:, ...) are returned untouched.  Leading "../" segments
     * never climb above the root of the site.
     *
     * Links should *always* be expanded before they are used.
     *
     * Note that all resource URLs will have correct trailing slashes.  If the URL
     * does not end with / then it is a file URL and not a directory.
     *
     * @param link the link as found in the document.
     * @return the expanded link.
     * @throws Exception kept for backwards compatibility with existing callers.
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public String expand( String link ) throws Exception {

        //anything that names its own scheme is already complete.
        if ( isInvalidScheme( link ) || hasScheme( link ) ) {
            return link;
        }

        //network-path reference: only the scheme is missing, so reuse the
        //scheme of the resource instead of assuming http.
        if ( link.startsWith( "//" ) ) {
            return getResourceScheme() + ":" + link;
        }

        //absolute path on the same site.
        if ( link.startsWith( "/" ) ) {
            return this.getSite() + link;
        }

        //fragment within the current document; replace any fragment the
        //resource itself may carry.
        if ( link.startsWith( "#" ) ) {

            int fragment = resource.indexOf( '#' );
            String document = fragment == -1 ? resource : resource.substring( 0, fragment );

            return document + link;

        }

        //query on the current document; replaces the query of the resource.
        if ( link.startsWith( "?" ) ) {
            return resource.substring( 0, getPathEnd() ) + link;
        }

        //relative path: resolve against the directory of the resource.
        String root = this.getSite() + "/";
        String base = this.getBase() + "/";

        while ( true ) {

            if ( link.equals( ".." ) || link.startsWith( "../" ) ) {

                //get rid of the first previous dir in the link
                link = link.substring( Math.min( 3, link.length() ) );

                //get rid of the last directory in the base, but never the host.
                if ( base.length() > root.length() && base.startsWith( root ) ) {
                    base = base.substring( 0, base.lastIndexOf( '/', base.length() - 2 ) + 1 );
                }

            } else if ( link.equals( "." ) || link.startsWith( "./" ) ) {

                //"." is the current directory; just drop it.
                link = link.substring( Math.min( 2, link.length() ) );

            } else {
                break;
            }

        }

        return base + link;

    }

    /**
     * Get the base of this URL.  For example if we are given:
     *
     * http://www.foo.com/directory/index.html
     *
     * we will return
     *
     * http://www.foo.com/directory
     *
     * A resource without a path (http://www.cnn.com) yields the site itself.
     * Slashes inside the query string or fragment are ignored.
     *
     * @return the directory of the resource, without a trailing slash.
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public String getBase() {

        String site = this.getSite();

        //last slash that belongs to the path (not to the query or fragment)
        int end = resource.lastIndexOf( '/', getPathEnd() - 1 );

        if ( end < site.length() ) {

            //probably a URL like http://www.cnn.com
            return site;

        }

        return resource.substring( 0, end );

    }

    /**
     * Get the site for this resource.  For example:
     *
     * http://www.foo.com/directory/index.html
     *
     * we will return
     *
     * http://www.foo.com
     *
     * @return the scheme and host part of the resource, without a trailing slash.
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public String getSite() {

        int end = resource.length();

        //the site ends where the path, query or fragment begins.
        for ( int i = getAuthorityStart(); i < resource.length(); ++i ) {

            char c = resource.charAt( i );

            if ( c == '/' || c == '?' || c == '#' ) {
                end = i;
                break;
            }

        }

        return resource.substring( 0, end );

    }

    /**
     * Add the given resource as a dependency of the current HTML file.  A
     * dependency that the list already contains (per <code>equals</code>) is
     * not added again.
     *
     * @param depend the dependency to register.
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public void addDependency( RelativizeDependency depend ) {

        if ( ! this.dependencies.contains( depend ) ) {

            this.dependencies.addElement( depend );

        }

    }

    /**
     * Return true if this is an invalid scheme and should NOT be expanded
     * (javascript, mailto, etc).  The comparison ignores case because URL
     * schemes are case-insensitive.
     *
     * @param resource the link to test.
     * @return true if the link starts with "javascript:" or "mailto:".
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public boolean isInvalidScheme( String resource ) {

        return startsWithIgnoreCase( resource, "javascript:" )
            || startsWithIgnoreCase( resource, "mailto:" );

    }

    /**
     * Case-insensitive version of String.startsWith.
     */
    private static boolean startsWithIgnoreCase( String value, String prefix ) {

        return value.regionMatches( true, 0, prefix, 0, prefix.length() );

    }

    /**
     * Return true if the given URL begins with a scheme, that is a letter
     * followed by letters, digits, '+', '-' or '.' and then a colon.
     */
    private static boolean hasScheme( String url ) {

        int colon = url.indexOf( ':' );

        if ( colon <= 0 || ! isAsciiLetter( url.charAt( 0 ) ) ) {
            return false;
        }

        for ( int i = 1; i < colon; ++i ) {

            char c = url.charAt( i );

            boolean allowed = isAsciiLetter( c )
                || ( c >= '0' && c <= '9' )
                || c == '+' || c == '-' || c == '.';

            if ( ! allowed ) {
                return false;
            }

        }

        return true;

    }

    /**
     * Return true for a-z and A-Z only.
     */
    private static boolean isAsciiLetter( char c ) {

        return ( c >= 'a' && c <= 'z' ) || ( c >= 'A' && c <= 'Z' );

    }

    /**
     * Offset in the resource just past "scheme://", or 0 when the resource
     * does not start that way.
     */
    private int getAuthorityStart() {

        if ( hasScheme( resource ) ) {

            int colon = resource.indexOf( ':' );

            if ( resource.startsWith( "//", colon + 1 ) ) {
                return colon + 3;
            }

        }

        return 0;

    }

    /**
     * Offset in the resource where the path ends, i.e. the position of the
     * first '?' or '#', or the length of the resource when it has neither.
     */
    private int getPathEnd() {

        int end = resource.length();

        int query = resource.indexOf( '?' );

        if ( query != -1 ) {
            end = query;
        }

        int fragment = resource.indexOf( '#' );

        if ( fragment != -1 && fragment < end ) {
            end = fragment;
        }

        return end;

    }

    /**
     * Scheme of the resource (for example "https"), falling back to "http"
     * when the resource is not set or does not start with "scheme://".
     */
    private String getResourceScheme() {

        if ( resource != null && getAuthorityStart() > 0 ) {
            return resource.substring( 0, resource.indexOf( ':' ) );
        }

        return "http";

    }

}
