/*
 *
 * BEGIN HEADER
 *
 * ---- 
 *
 * $ID: PantherProxy.java,v 1.6 2001/06/12 20:58:43 burton Exp $
 * $Project: http://panther.openprivacy.org $
 * $CVSROOT: :pserver:anoncvs@sierra.openprivacy.org:/usr/local/cvs/public $
 * $WebCVS: http://www.openprivacy.org/cgi-bin/cvsweb/cvsweb.cgi/panther/ $
 * $Mailing-List: http://www.openprivacy.org/lists/ $
 * $Bugzilla: http://bugzilla.openprivacy.org/ $
 * Copyright 2001 OpenPrivacy.org.  All rights reserved.
 *
 * ---- 
 *
 * Copyright 2001 OpenPrivacy.org.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the LICENSE which you should have received with this package. 
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE.
 *
 * END HEADER
 * 
 */

package org.openprivacy.reptile;

import java.io.*;
import java.net.*;
import java.util.*;

import javax.servlet.*;
import javax.servlet.http.*;

import org.openprivacy.reptile.actions.*;
import org.openprivacy.reptile.extensions.*;
import org.openprivacy.reptile.init.*;
import org.openprivacy.reptile.tasks.*;
import org.openprivacy.reptile.util.*;
import org.openprivacy.reptile.xslt.*;

import talon.*;
import talon.components.*;
import talon.resources.*;
import talon.util.*;
import talon.util.net.*;

import org.apache.regexp.*;

import org.jdom.*;
import org.jdom.output.*;

/**
 *
 * An HTML -> RSS serializer that takes a given HTML resource, extracts the
 * <code>&lt;a href="..."&gt;</code> links it contains and keeps only the links
 * whose URL path matches a "base" regular expression.  By default the base
 * matches paths that begin with a four digit year such as /2002.  Every link
 * that is kept is then loaded through its own
 * <code>RSSContentSerializer</code> and written out as an RSS 1.0 item.
 *
 * Typical usage is: construct, optionally {@link #setBase}, call
 * {@link #parse} and then {@link #getRSS}.
 *
 * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
 */
public class RSSWebsiteFilterSerializer {

    private static final boolean DEBUG = true;

    /**
     * Upper bound on the number of links that are counted while building
     * items in {@link #getRSS}.
     */
    private static final int MAX_ITEMS = 15;

    /** Serializer for the master (channel) resource. */
    private final RSSContentSerializer rcs;

    /** Links accepted by the last call to {@link #parse}; never null. */
    private String[] links = new String[0];

    /** HTML of the master resource as produced during {@link #parse}. */
    private String html = null;

    //The base regular expression that a link's path has to match.  By default
    //we accept any path that starts with a four digit year of this century
    //(/2000 through /2099).
    private String base = "/20[0-9][0-9]";

    /**
     * 
     * Create a new <code>RSSWebsiteFilterSerializer</code> instance.
     *
     * @param resource the resource handed to the underlying
     * <code>RSSContentSerializer</code>; it is also used as the channel link
     * in the generated RSS.
     *
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public RSSWebsiteFilterSerializer( String resource ) {

        this.rcs = new RSSContentSerializer();

        this.rcs.setResource( resource );

    }

    /**
     * Parse this channel.  This should be called before any other methods that
     * return any data.
     *
     * The master resource is parsed, its HTML is run through
     * <code>cleanseHTML</code> and <code>relativize</code> of the underlying
     * serializer, and all anchors found in the result are filtered against the
     * base expression.  Duplicate links are dropped and the order of first
     * appearance is kept.  The configured base itself is never modified, so
     * this method may be called more than once.
     *
     * @throws Exception if the master resource can not be parsed or the base
     * is not a valid regular expression.
     *
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public void parse() throws Exception {

        this.rcs.parse();

        //now get the content
        this.html = this.rcs.cleanseHTML( this.rcs.getResourceAsString() );
        this.html = this.rcs.relativize( this.html );

        //ok... now find links under this channel.
        String[] anchors = getAnchors( this.html );

        RE regexp = new RE( getAnchoredBase() );

        Vector results = new Vector();

        //now see which anchors we can use.
        for ( int i = 0; i < anchors.length; ++i ) {

            String link = anchors[i];

            try {

                String path = new URL( link ).getPath();

                if ( regexp.match( path ) && results.contains( link ) == false ) {

                    results.addElement( link );

                }

            } catch ( MalformedURLException ignored ) {

                //deliberately skipped: java.net.URL could not parse this link
                //(for example it has no protocol) so there is no path that we
                //could match against.

            }

        }

        //only publish the links once filtering succeeded so that a failure
        //above never leaves unfiltered anchors behind.
        this.links = new String[ results.size() ];
        results.copyInto( this.links );

    }

    /**
     * Build the expression that is actually matched against a link's path.
     *
     * Unless the base contains a "$" it is anchored to the beginning of the
     * path with a leading "^".  The result is computed on the fly so that the
     * value returned by {@link #getBase} stays exactly what the caller set.
     */
    private String getAnchoredBase() {

        String pattern = this.base;

        if ( pattern.indexOf( "$" ) == -1 && pattern.startsWith( "^" ) == false ) {

            pattern = "^" + pattern;

        }

        return pattern;

    }

    /**
     * Get all links for this content.  Links are denoted as all
     * <code>&lt;a href=""&gt;</code> links.
     *
     * @param html the markup to scan; <code>null</code> yields an empty array.
     * @return the href values in document order, duplicates included.
     *
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public String[] getAnchors( String html ) throws Exception {

        if ( html == null ) {

            return new String[0];

        }

        //FIXME: what if href isn't the first attribute?  It will fail here...
        RE regexp = new RE( "<a href=\"([^\"]+)\"", RE.MATCH_CASEINDEPENDENT | RE.MATCH_MULTILINE );

        Vector found = new Vector();

        int index = 0;

        while ( regexp.match( html, index ) ) {

            found.addElement( regexp.getParen( 1 ) );

            //continue searching right after the anchor we just consumed
            index = regexp.getParenEnd( 0 );

        }

        String result[] = new String[ found.size() ];
        found.copyInto( result );

        return result;

    }

    /**
     * Get the resource as an RSS stream with mod_content
     *
     * The channel is described with the title and description of the master
     * resource (generic text based on the resource is used when they are
     * null).  Each link accepted by {@link #parse} is then loaded through a
     * fresh <code>RSSContentSerializer</code>; links that fail to load or that
     * have no description are left out of the output.
     *
     * @return the RSS 1.0 document as a String.
     *
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public String getRSS() throws Exception {

        //OK... use JDOM to serialize this to RSS 1.0

        Namespace content_ns = Namespace.getNamespace( "content", "http://purl.org/rss/1.0/modules/content/" );
        Namespace rdf_ns = Namespace.getNamespace( "rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#" );
        Namespace rss_ns = Namespace.getNamespace( "rss", "http://purl.org/rss/1.0/" );

        //FIXME: pull out the site link for the item and use this for the about
        //and link for the channel.

        //root element
        Element rdf = new Element( "RDF", rdf_ns );

        String resource = this.rcs.getResource();
        String title = this.rcs.getTitle();
        String description = this.rcs.getDescription();

        if ( title == null ) {

            title = "content for: " + resource;

        }

        if ( description == null ) {

            description = "Serialized content for the following URL: " + resource;

        }

        //channel element
        Element channel = new Element( "channel", rss_ns );
        channel.setAttribute( "about", resource, rdf_ns );

        rdf.addContent( channel );

        //add required elements under content
        channel.addContent( new Element( "title", rss_ns ).setText( title ) );
        channel.addContent( new Element( "link", rss_ns ).setText( resource ) );
        channel.addContent( new Element( "description", rss_ns ).setText( description ) );

        Element itemseq = new Element( "Seq", rdf_ns );

        //add the items element
        channel.addContent( new Element( "items" , rss_ns ).addContent( itemseq ) );

        //FIXME: some sites (washingtonpost) have different URLs pointing to the
        //same content.  We should use the title of the link to remove these
        //duplicate URLs.

        //NOTE: count tracks the links we have worked on, including links that
        //failed to load or had no description.  Links skipped because of an
        //invalid title are not counted.
        int count = 0;

        for ( int i = 0; i < this.links.length && count < MAX_ITEMS; ++ i ) {

            String link = this.links[i];

            try {

                RSSContentSerializer nrcs = new RSSContentSerializer();
                nrcs.setResource( link );
                nrcs.parse();

                String idescription = nrcs.getDescription();

                if ( idescription != null && idescription.equals( "" ) == false ) {

                    if ( isInvalidTitle( nrcs.getTitle() ) ) {
                        continue;
                    }

                    String ititle = resolveItemTitle( nrcs, idescription );

                    Element iteme = new Element( "item", rss_ns );
                    iteme.setAttribute( "about", link, rdf_ns );
                    iteme.addContent( new Element( "title", rss_ns ).setText( ititle ) )
                        .addContent( new Element( "link", rss_ns ).setText( link ) )
                        .addContent( new Element( "description", rss_ns ).setText( idescription ) )
                        .addContent( new Element( "encoded", content_ns )
                                     .setText( nrcs.getContent() ) );

                    rdf.addContent( iteme );

                    itemseq.addContent( new Element( "li", rdf_ns )
                                        .setAttribute( "resource", link, rdf_ns ) );

                }

            } catch ( Exception e ) {
                //this is acceptable: a single broken link must not prevent the
                //rest of the channel from being serialized.

                e.printStackTrace();

            }

            ++count;

        }

        String encoding = "UTF-8";

        XMLOutputter outputter = new XMLOutputter( "    ", true, encoding );

        outputter.setEncoding( encoding );
        outputter.setOmitDeclaration( false );
        outputter.setOmitEncoding( false );

        return outputter.outputString( rdf );

    }

    /**
     * Determine the title to use for an item.
     *
     * The title of the item itself is used unless it is missing, empty,
     * shorter than <code>RSSContentSerializer.MIN_TITLE_WIDTH</code> or
     * insufficient (see {@link #isInsufficientTitle}).  In those cases the
     * title is derived from the description via
     * <code>RSSContentSerializer.getTitle( String )</code>.
     */
    private String resolveItemTitle( RSSContentSerializer nrcs, String description ) throws Exception {

        String title = nrcs.getTitle();

        //the null test has to come first: everything after it dereferences
        //the title.
        if ( title == null ||
             title.equals( "" ) ||
             title.length() < RSSContentSerializer.MIN_TITLE_WIDTH ||
             isInsufficientTitle( nrcs ) ) {

            title = nrcs.getTitle( description );

        }

        return title;

    }

    /**
     * Return true if this article title isn't going to be used.  A null title
     * is never reported as invalid.
     *
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    private boolean isInvalidTitle( String title ) {

        //HACK: (constant first so that a null title is handled)
        return "Thank you from washingtonpost.com".equals( title );

    }

    /**
     * Return true if we can't use this title...
     *
     * A title is insufficient when it is null, when it equals the channel
     * title, or when it contains the channel title and adds fewer than 5
     * characters to it.  When the channel has no title there is nothing to
     * compare against and any non-null title is accepted.
     *
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    private boolean isInsufficientTitle( RSSContentSerializer nrcs ) {

        String title = nrcs.getTitle();

        //always true on null titles
        if ( title == null ) { return true; }

        String channelTitle = this.rcs.getTitle();

        if ( channelTitle == null ) { return false; }

        if ( channelTitle.equals( title ) ) {

            return true;

        }

        if ( title.indexOf( channelTitle ) != -1 ) {

            return title.length() - channelTitle.length() < 5;

        }

        return false;

    }

    /**
     * Set the base for this so that we know what to look for.
     *
     * @param base a regular expression that is matched against the path of
     * every link.  Unless it contains a "$" it is anchored to the beginning
     * of the path when {@link #parse} is called.
     *
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public void setBase( String base ) {

        this.base = base;

    }

    /**
     * Get the base exactly as it was configured.
     */
    public String getBase() {
        return this.base;
    }

    /**
     * Display syntax.
     *
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public static void syntax() {

        System.out.println( "SYNTAX: " + RSSWebsiteFilterSerializer.class.getName() + " URL [base]" );

    }

    /**
     * Handle operations from the command line.
     *
     * Expects the URL of the resource and optionally a base expression.  The
     * resulting RSS is written to standard out.
     *
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public static void main( String[] args ) {

        if ( args.length != 1 && args.length != 2 ) {

            syntax();
            return;

        }

        RSSWebsiteFilterSerializer serializer = new RSSWebsiteFilterSerializer( args[0] );

        if ( args.length == 2 ) {

            serializer.setBase( args[ 1 ] );

            System.err.println( "Using base: " + serializer.getBase() );

        }

        try {

            serializer.parse();

            System.out.println( serializer.getRSS() );

        } catch ( Exception e ) {

            e.printStackTrace();

        }

    }

    /**
     * Write the message to standard error when DEBUG is enabled.
     *
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    private void debug( String message ) {

        if ( DEBUG ) {

            System.err.println( message );

        }

    }

}
