/*
 *
 * BEGIN HEADER
 *
 * ---- 
 *
 * $ID: PantherProxy.java,v 1.6 2001/06/12 20:58:43 burton Exp $
 * $Project: http://panther.openprivacy.org $
 * $CVSROOT: :pserver:anoncvs@sierra.openprivacy.org:/usr/local/cvs/public $
 * $WebCVS: http://www.openprivacy.org/cgi-bin/cvsweb/cvsweb.cgi/panther/ $
 * $Mailing-List: http://www.openprivacy.org/lists/ $
 * $Bugzilla: http://bugzilla.openprivacy.org/ $
 * Copyright 2001 OpenPrivacy.org.  All rights reserved.
 *
 * ---- 
 *
 * Copyright 2001 OpenPrivacy.org.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the LICENSE which you should have received with this package. 
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE.
 *
 * END HEADER
 * 
 */

package org.openprivacy.reptile;

import java.io.*;
import java.net.*;
import java.util.*;

import javax.servlet.*;
import javax.servlet.http.*;

import org.openprivacy.reptile.actions.*;
import org.openprivacy.reptile.extensions.*;
import org.openprivacy.reptile.init.*;
import org.openprivacy.reptile.tasks.*;
import org.openprivacy.reptile.util.*;
import org.openprivacy.reptile.xslt.*;
import org.openprivacy.reptile.xml.*;

// import talon.*;
// import talon.components.*;
// import talon.resources.*;
import talon.util.*;
// import talon.util.net.*;

import org.apache.regexp.*;

import org.jdom.*;
import org.jdom.output.*;

/*

HIGH PRIORITY

- write a unit test to fix <img links that don't have SRC as the first
  attribute.

- When trying to get the title from the description: if the match is like "A."
  then we should assume it is an abbreviation and keep matching.

     - EXAMPLES: 
  
        Bucking an anti-war mood among their U.N.

            WASHINGTON -- As Defense Secretary Donald H. Rumsfeld asked the
            Senate not to delay a vote on an Iraq resolution until after
            Election Day, Secretary of State Colin Powell told a House panel on
            Thursday that Iraq is abusing the United Nations' good will and
            integrity. "The indictment that the president laid out didn't need
            much discussion or debate. Everybody sitting in that chamber last
            Thursday knew that Iraq stood guilty of the charges," Powell told
            the House International Relations Committee, referring to last
            week's appearance by President Bush to lay out the case against Iraq
            before the U.N. General Assembly.
        
        As Defense Secretary Donald H.

            UNITED NATIONS -- Bucking an anti-war mood among their U.N. Security
            Council partners, the United States and Britain began crafting a
            toughly worded resolution Wednesday that would narrow the timetable
            for Iraqi compliance with weapons inspections and authorize force if
            Iraq fails to cooperate, diplomats said. The two allies plan to
            complete and circulate the draft next week to the three other
            permanent members of the Security Council -- France, Russia and
            China -- diplomats told The Associated Press on condition of
            anonymity. France, Russia and Arab nations oppose a new resolution.

        Mr.

            Mr. President, I wanted to take a few minutes of leader time this
            morning, before we get into the debate on the amendment offered by
            the Senator from Texas, to talk about a concern that I have wanted
            to avoid talking about for weeks. I am very saddened by the fact
            that we have debated homeland security now for 3 weeks. I have noted
            on several occasions that there is no reason, on a bipartisan basis,
            this
            
    - put the RSS Aggregator module in here with the source URL

    - Fix the output so that it correctly has \n chars...

    - Make sure we remove totally illegal content

       0x7 BELL (this is actually used!)

       .. basically anything < 32 We are going to have to do string scanning...

       this should be a legalize function...

Broken URLs:

    - Remove URLs from description bodies.  This is necessary because we don't
      want to use long strings within the description since they do not wrap.

    - http://www.reuters.com/news_article.jhtml?type=worldnews&StoryID=1449497

        -  Has some Javascript within the body.

    - Wow... this is REALLY broken!  Why the hell is it missing this first
      paragraphs?

          http://www.peerfear.org/offnews/

    - Missing first paragraph!!!

        http://news.bbc.co.uk/1/hi/sci/tech/2251386.stm

    - this is REALLY REALLY BAD

        http://www.boston.com/dailyglobe2/253/business/Intel_chip_to_include_antipiracy_features+.shtml

            - it is one huge PCDATA section that should be marked junk.

    - http://www.latimes.com/news/custom/showcase/la-me-aldrinsep10.story

       WAY too much junk content!!! What is going on!!!

    - http://www.msnbc.com/news/806174.asp?cp1=1

       - missing paragraph that begins with:

         "...Scott Ritter, a former United Nations arms inspector who is in
         Baghdad, has said that there is no evidence"

         - this might work if <a> were a top level element.

         - Maybe I should just change the parser to look for flat arrays of
           PCDATA that don't have to be broken.  like:

             <a>
             <br>

             etc...

             instead of terminated elements.

         - run test and then add this as a unit test.
             
    - http://neptune.spaceports.com/~words/beavis.html

    - Some sites complain that my browser is configured not to accept cookies.

    - Isn't able to include K5 commentary.

    http://www.satirewire.com/news/aug02/hasta.shtml

        - still dropping the intro paragraph :(

    http://news.nationalgeographic.com/news/2002/08/0823_020823_asteroid.html

        - are we not correctly removing the # char?

        - the problem here looks like we need another failover with
          acceptableInnerElements within firstLevelElements

        - browsers don't pass # chars?

    http://www.msnbc.com/news/677951.asp

        - now it totally breaks.  This is a big problem.

        - it is a parsing problem. I should refactor (or at least consider it)
          the way I find PCDATASections.  I think we are running past my
          indexes somehow.

        - need to put back the index to the last element when it FAILS
          
    - Don't use global data for the parse index.

    - I need to get rid of <script> sections because this can also fuck us up
      because of embedded crap...

     - http://news.com.com/2010-1071-954964.html?tag=fd_nc_1

         - fix isJunkContent so that it

           also includes CDATA shorter than 5 characters

           - returns true if 80% of the content is junk

               - I can do this by matching on ..... then seeing what percentage
                 this is.

                  <b class="a2">

                    <a href="/2020-1069-0.html">News.context:</a>

                    <a href="/2018-1070-0.html">Special Reports</a>
                    
                     | 
                    
                    <a href="/2005-1082-0.html">Newsmakers</a>

                     | 
                    
                    <span class="g3">Perspectives</span>
                    
                  </b>                 
           
FIXME:

- Clean blockquotes on Advogato, where they are used in the original format but
  are not needed within mod_content

- We should trim leading <br>s

   <br>
   <br>
   <br>
   <br>
   <p>
   Hello world!
   </p>

- Another optimization: it might be possible to find large junk sections in the
  middle of the HTML and rip them out if they are within tables.

    - http://www.msnbc.com/news/806174.asp?cp1=1

- It should be possible to remove the junk that can arise within the middle of
an article.  Example:

   pcdata

   junk

   pcdata

   links (which would normally be junk but are probably links to further info)

   We can clean up the middle junk when complete

- it should be possible to further narrow down bad sections... for if we have
  some pcdata like:

    <b>Hello World</b> <b>Hello World</b> <b>Hello World</b>

    right now this would be acceptable but ONLY if we trimmed it.  It should be
    possible to look at the length of CDATA between elements and NONE are
    greater than MIN_CDATA_LENGTH we should drop it.

    This will still pull out long paragraphs of text though.

- What do I want to do about CSS inclusion? I would have to write a CSS parser?

    - fuck it.. I don't care!  I should remove class attributes though.

    - We should also remove <style> elements

- It is dropping paragraphs because they are doing things like:

    http://www.xml.com/pub/a/2002/08/14/deviant.html

    <p>

        <table>

        </table>

        Missed chapter
        
    </p>

    <p>

        Recognized chapter

    </p>

    - I could probably fix this by waiting until something fails and then look
      for internal CDATA by stripping it and then checking if it is a first level
      element.

      - actually I don't think this is a good idea because we could strip the
        whole document on a low level element.

- Implement some sort of "page spanning" feature that allows me to hit the
  "next" button on pages so that I can accumulate that text too.

- expand . within links.  like:

    ./foo.html

- It should be possible to use this mechanism with shitty (AKA RSS 0.9x) RSS
   feeds to uplevel them and make better RSS output.

- Write an algorithm to complete elements... so for example if someone
  syndicates a <b> element with no ending we should just stick a </b> at the end
  of our content.

- Ability to explicitly set modes on the serializer.

- The evhead site does not parse after I serialize it to a channel.

- Update the user agent so that it isn't 'Java' but is Mozilla.

- Need a servlet to explicitly enable A ref output.  This would work well for
  advogato and other such sites.

  oh... no... tokenize the document based on <a name="">
  
- Sometimes paragraphs are stored under presentation items like 'li'.  We should
  also include these:

      http://xmlhack.com/read.php?item=1749

      - example:

      <li>

          <p>
          Hello
          </p>
          
      </li>

      - Maybe I should just add acceptableInnerElements to the list of
        firstLevelElements.  I mean it is acceptable if one of these matches.

- INCLUDE_ACCEPTABLE_WITHIN_FIRSTLEVEL

    Seems to work.  I need to close all my elements.  I also think that
    isJunkContent has a bug which is giving too much info.  Enable this in the
    future once things settle down.

*/

/**
 * Handles serializing HTML content to RSS 1.0
 *
 * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
 */
public class RSSContentSerializer {

    /**
     * Version stamp for this component.
     *
     * NOTE(review): the value looks like a Unix timestamp -- confirm how it is
     * generated; it is not referenced in the visible part of this class.
     */
    public static final String COMPONENT_VERSION = "1032684403";
    
    /**
     * Value sent as the <code>User-Agent</code> request header when the
     * resource is fetched in getResourceAsString().
     */
    public static final String USER_AGENT_STRING  =
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.1) Gecko/20020826";
    
    /**
     * Minimum title width, presumably in characters.
     *
     * NOTE(review): not enforced in the visible part of this class; a FIXME in
     * parse() says short titles are handled by the implementing parser --
     * confirm.
     */
    public static final int MIN_TITLE_WIDTH = 20;

    /**
     * Maximum title width, presumably in characters -- TODO confirm where it
     * is applied.
     */
    public static final int MAX_TITLE_WIDTH = 200;

    /**
     * Minimum amount of junk data in PCDATA after which we consider the whole
     * thing junk.
     *
     */
    public static final int MIN_JUNK_DATA_PERCENTAGE = 80;
    
    /**
     * If true, acceptable elements are also considered first level elements.
     */
    public static final boolean INCLUDE_ACCEPTABLE_WITHIN_FIRSTLEVEL = false;
    
    /**
     * Minimal parse mode.  Less chance of failure and false positive.
     */
    public static final int MODE_MINIMAL = 0;

    /**
     * Flexible parse mode.  More chance of failure and false positive but works
     * with sites that syndicate content with <br> and <td> tags.  Should only
     * be used if MODE_MINIMAL doesn't work.
     */
    public static final int MODE_FLEXIBLE = 1;

    /**
     * Mode for matching A name sections.
     */
    public static final int MODE_ANCHOR = 2;

    /**
     * Keep adding stripped PCDATA to the description until it is at least this
     * width.
     */
    public static final int MIN_DESCRIPTION_LENGTH = 500;

    /**
     * Limit handed to truncate() for the description at the end of parse().
     */
    public static final int MAX_DESCRIPTION_LENGTH = 900;

    /**
     * Variable used to detect if a piece of text is valid CDATA.  If a piece of
     * text is < MIN_CDATA_LENGTH we do not consider it acceptable.
     *
     * HISTORY:
     *
     * - 20 - seems to work on most places but fails in some.
     * 
     * - 50 - required for sites that syndicate links to other articles prior to
     * the real content.  The only problem with this larger value is it skips
     * small valid links like: "<p>This is the title</p>" but I believe this is
     * acceptable
     * 
     */
    public static final int MIN_CDATA_LENGTH = 50;

    /**
     * When true we enable debug mode which prints out information about
     * processing.  This code is somewhat complex and difficult to understand so
     * this is necessary on URLs that are broken.  Keeping this a static final
     * variable should allow the compiler to remove this code.
     */
    public static final boolean DEBUG = false;

    /**
     * When true the constructor also registers the form related elements
     * (form, input, textarea, button, select, option, optgroup) as acceptable
     * inner elements.
     */
    public static final boolean INCLUDE_FORMS = false;

    /**
     * NOTE(review): presumably the site URL behind getSite(); it is not
     * assigned anywhere in the visible code -- confirm.
     */
    private String site = null;
    
    /**
     * The HTML being serialized.  Fetched and cleansed by init() or supplied
     * through setHTML().
     */
    private String html = null;
    
    /**
     * Title used for the channel and the item in getRSS().  When null getRSS()
     * falls back to text derived from the resource URL.
     */
    private String title = null;
    
    /**
     * Plain text description.  parse() pads it with stripped PCDATA sections
     * and truncates it.
     */
    private String description = "";

    /**
     * Stores all found PCDATA sections (as PCDATASection instances).
     */
    private Vector vpcdata = new Vector();

    /**
     * Holds the resource we are trying to fetch...
     */
    private String resource = null;

    /**
     * Stores the result of parsing operations
     */
    private StringBuffer content = new StringBuffer();

    /**
     * Names of the first level elements.  Populated by the constructor and
     * rebuilt from scratch by the fallback parse passes (see resetParse()).
     */
    private Vector firstLevelElements = new Vector();

    /**
     * Names of the acceptable inner elements.  Copied into firstLevelElements
     * by parseAcceptableInnerElements().
     */
    private Vector acceptableInnerElements = new Vector();

    /**
     * Cleared by resetParse().
     *
     * NOTE(review): presumably set once parseHTML finds real content --
     * confirm against parseHTML.
     */
    private boolean foundNonJunkContent = false;

    /**
     * Current parse mode; starts out as MODE_MINIMAL.
     */
    private int mode = MODE_MINIMAL;

    /**
     * if the resource is a URL to a HTML document with an anchor, we save the
     * anchor name here.  Example: http://www.peerfear.org/#anchor where the
     * variable value will be anchorName
     */
    private String anchorName = null;
    
    /**
     * True once init() has fetched the HTML; parse() calls init() while this
     * is still false.
     */
    private boolean initialized = false;

    /**
     * Create a new <code>RSSContentSerializer</code> instance and register the
     * default first level elements and acceptable inner elements.
     *
     * Element names are registered without surrounding whitespace: the tag
     * names captured by the parser regexp (<code>&lt;([^/&gt; ]+)</code>)
     * never contain a space, so an entry such as "strike " could not be
     * expected to match a real tag.
     *
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public RSSContentSerializer() {

        //init first level elements
        String[] firstLevel = {
            "p", "pre", "h1", "h2", "h3", "h4", "blockquote",
            "ul", "ol", "dl", "b", "strong", "span"
        };

        for ( int i = 0; i < firstLevel.length; ++i ) {

            firstLevelElements.addElement( firstLevel[i] );

        }

        //FIXME: this is needed because of named anchors.
        //firstLevelElements.addElement( "a" );

        //registration order is kept from the original list; "quote" and
        //"strike" used to carry a trailing space and "br" was listed twice.
        String[] acceptable = {
            "b", "img", "a", "i", "font", "blockquote", "span", "div", "em",
            "br", "ul", "ol", "li", "pre", "dl", "dd", "dt", "code", "ins",
            "del", "q", "quote", "strong", "abbr", "acronym", "cite", "samp",
            "sub", "sup", "u", "nitf", "xmp", "var", "kbd", "dfn", "big", "tt",
            "strike", "s", "p", "h1", "h2", "h3", "h4", "nobr", "wbr",
            "address", "fieldset", "legend"
        };

        for ( int i = 0; i < acceptable.length; ++i ) {

            addAcceptableInnerElement( acceptable[i] );

        }

        //form related elements
        if ( INCLUDE_FORMS ) {

            String[] forms = {
                "form", "input", "textarea", "button",
                "select", "option", "optgroup"
            };

            for ( int i = 0; i < forms.length; ++i ) {

                addAcceptableInnerElement( forms[i] );

            }

        }

    }

    /**
     * Return the HTML this serializer is working on.
     *
     * @return the current value of <code>html</code>, possibly null.
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public String getHTML() {
        return html;
    }

    /**
     * Replace the HTML this serializer is working on.
     *
     * @param html the new value of <code>html</code>.
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public void setHTML( String html ) {
        this.html = html;
    }
    
    /**
     * Fetch the current resource over the network and return its body as a
     * string.  The request is sent with USER_AGENT_STRING as User-Agent.  If
     * the connection ended up on a different URL (redirect) the
     * <code>resource</code> field is updated to that URL.
     *
     * @return the body of the resource as produced by InputStreamUtils.
     * @throws Exception if the URL is invalid, the content type is present
     * but is not text/html, or the resource cannot be read.
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public String getResourceAsString() throws Exception {

        URLConnection conn = new URL( resource ).openConnection();

        conn.setRequestProperty( "User-Agent", USER_AGENT_STRING );

        //null when the server does not send a content type; in that case we
        //optimistically treat the resource as HTML.
        String contentType = conn.getContentType();

        //media types are case insensitive so compare in lower case.
        if ( contentType != null &&
             contentType.toLowerCase( Locale.ENGLISH ).indexOf( "text/html" ) == -1 ) {

            throw new Exception( "Only HTML content is supported and the following content type was detected: " +
                                 contentType );

        }

        InputStream is = conn.getInputStream();

        try {

            //update the resource if this URL was redirected
            this.resource = conn.getURL().toExternalForm();

            return InputStreamUtils.toString( is );

        } finally {

            //always release the connection's stream, even when reading fails.
            try {

                is.close();

            } catch ( IOException ignored ) {

                //nothing useful can be done about a failed close.

            }

        }

    }

    /**
     * Return the title of the serialized content.
     *
     * @return the current value of <code>title</code>, possibly null.
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public String getTitle() {
        return title;
    }

    /**
     * Replace the title of the serialized content.
     *
     * @param title the new value of <code>title</code>.
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public void setTitle( String title ) {
        this.title = title;
    }

    /**
     * Get the value of <code>description</code> after running it through
     * cleanseDate, cleanseLeadingGarbage and cleanseEntities (in that order).
     * The cleansed value is stored back into the field.
     *
     * Cleansing is best effort: when it fails the description is returned as
     * it was, and the failure is reported through debug() instead of being
     * dropped silently.
     *
     * @return the (cleansed, when possible) description.
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public String getDescription() {

        try {

            description = cleanseEntities( cleanseLeadingGarbage( cleanseDate( description ) ) );

        } catch ( Exception e ) {

            //keep the uncleansed description but leave a trace of the problem.
            debug( "Unable to cleanse description: " + e );

        }

        return description;

    }

    /**
     * Replace the description of the serialized content.
     *
     * @param description the new value of <code>description</code>.
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public void setDescription( String description ) {
        this.description = description;
    }

    /**
     * Return the resource (URL without any anchor) being serialized.
     *
     * @return the current value of <code>resource</code>, possibly null.
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public String getResource() {
        return resource;
    }

    /**
     * Set the value of <code>resource</code>.  When the given URL contains a
     * '#', everything after the first '#' is stored as the anchor name and
     * everything before it as the resource; otherwise the URL is stored as is
     * and any previously stored anchor name is cleared, so that an anchor from
     * an earlier resource cannot leak into the next parse.
     *
     * @param resource the URL to serialize; null clears both the resource and
     * the anchor name.
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public void setResource( String resource ) {

        //System.err.println( "resource: " + resource );

        //forget the anchor of whatever resource was set before.
        this.anchorName = null;

        if ( resource == null ) {

            this.resource = null;
            return;

        }

        int hash = resource.indexOf( '#' );

        //if the link contains # we need to remove this
        if ( hash != -1 ) {

            this.anchorName = resource.substring( hash + 1 );

            this.resource = resource.substring( 0, hash );

            debug( "anchorName: " + anchorName );
            debug( "resource: " + this.resource );

        } else {

            this.resource = resource;

        }

    }

    /**
     * Throw away the results of a previous parse pass so that another pass can
     * start from a clean slate: description, PCDATA sections, content buffer,
     * first level elements and the non-junk flag are all replaced.
     *
     * NOTE(review): the first level elements are reset to an EMPTY list, not
     * to the list the constructor registers -- confirm that the fallback
     * passes are meant to start without the default first level elements.
     *
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    private void resetParse() {

        //parse results
        content = new StringBuffer();
        vpcdata = new Vector();
        description = "";
        foundNonJunkContent = false;

        //parse configuration
        firstLevelElements = new Vector();

    }

    /**
     * Second parse pass: discard the results of the previous pass and parse
     * the HTML again with td, br and div registered as first level elements.
     *
     * @throws Exception if parsing the HTML fails.
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    private void parseSecondary() throws Exception {

        debug( "=================================================================" );
        debug( "RESORTING TO SECONDARY PARSE" );
        debug( "=================================================================" );

        resetParse();

        //add more first level elements
        String[] extra = { "td", "br", "div" };

        for ( int i = 0; i < extra.length; ++i ) {

            firstLevelElements.addElement( extra[i] );

        }

        //acceptableInnerElements.removeElement( "br" );

        parseHTML( html );

    }
    
    /**
     * 
     *
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    private void parseAcceptableInnerElements() throws Exception {

        debug( "=================================================================" );
        debug( "RESORTING TO acceptableInnerElements PARSE" );
        debug( "=================================================================" );
            
        resetParse();
            
            Enumeration enum = acceptableInnerElements.elements();

        while ( enum.hasMoreElements() ) {

            firstLevelElements.addElement( enum.nextElement() );
                
        } 

        this.parseHTML( this.html );

    }

    /**
     * Parse out the HTML that belongs to the named anchor of this resource.
     * The section starts at <code>&lt;a name="anchorName"</code> and ends at
     * the first of:
     *
     * - the next <code>&lt;a name=</code>,
     * - the next opening tag that is neither an acceptable inner element nor
     *   a first level element,
     * - the end of the document (when neither of the above exists).
     *
     * The section is cleansed, recorded as a PCDATASection and appended to the
     * content.
     *
     * @throws Exception if the anchor cannot be found in the HTML.
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    private void parseAnchorMode() throws Exception {

        //FIXME: how do we pull out the title?

        debug( "=================================================================" );
        debug( "ANCHOR MODE PARSE" );
        debug( "=================================================================" );

        //the anchor name is data, not a pattern, so escape it before it is
        //embedded in the regexp.
        RE regexp = new RE( "<a name=\"" + escapeAnchorForRegexp( this.anchorName ) + "\"",
                            RE.MATCH_CASEINDEPENDENT | RE.MATCH_MULTILINE );

        if ( regexp.match( html ) == false ) {

            throw new Exception( "Anchor not found: " + this.anchorName );

        }

        int begin = regexp.getParenStart( 0 );
        int anchorEnd = regexp.getParenEnd( 0 );

        //when nothing terminates the section it runs to the end of the
        //document (closing tags such as </body> are never matched below).
        int end = html.length();

        //now look for the end which is going to be the next A ...
        RE endr = new RE( "<a name=", RE.MATCH_CASEINDEPENDENT );

        if ( endr.match( html, anchorEnd ) ) {

            end = endr.getParenStart( 0 );

        } else {

            //... or the first opening tag that we would not keep.
            endr = new RE( "<([^/> ]+)", RE.MATCH_CASEINDEPENDENT  );

            int parseIndex = anchorEnd;

            while ( endr.match( html, parseIndex ) ) {

                parseIndex = endr.getParenEnd( 0 );

                String local_name = endr.getParen( 1 ).toLowerCase();

                if ( isAcceptableInnerElement( local_name ) == false &&
                     isFirstLevelElement( local_name ) == false ) {

                    end = endr.getParenStart( 0 );

                    break;

                }

            }

        }

        String pcdata = cleansePCDATA( stripnbsp( html.substring( begin, end ) ) );
        String stripped = strip( pcdata );

        debug( "SUCCESS:" );

        debug( pcdata );

        this.vpcdata.addElement( new PCDATASection( pcdata, stripped, begin, end ) );

        this.content.append( pcdata );

    }

    /**
     * Backslash-escape the regexp metacharacters in an anchor name so that it
     * is matched literally.
     */
    private static String escapeAnchorForRegexp( String text ) {

        StringBuffer escaped = new StringBuffer( text.length() + 8 );

        for ( int i = 0; i < text.length(); ++i ) {

            char c = text.charAt( i );

            if ( "\\[](){}.*+?^$|".indexOf( c ) != -1 ) {

                escaped.append( '\\' );

            }

            escaped.append( c );

        }

        return escaped.toString();

    }

    /**
     * Fetch the HTML of the resource, run it through cleanseHTML and mark this
     * serializer as initialized.  parse() calls this on demand.
     *
     * @throws Exception if the resource cannot be fetched or cleansed.
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public void init() throws Exception {

        //get the resource as a string.
        String fetched = getResourceAsString();

        this.html = fetched;

        //cleanse the html so that we don't have any comments
        this.html = cleanseHTML( fetched );

        initialized = true;

    }
    
    /**
     * Parse this channel.  This should be called before any other methods that
     * return any data.
     *
     * Steps: initialize on demand; parse in anchor mode when the resource had
     * an anchor, otherwise run the normal pass followed by the fallback passes
     * while the stripped content is shorter than the repass threshold; pad the
     * description with stripped PCDATA sections until it reaches
     * MIN_DESCRIPTION_LENGTH; truncate it to MAX_DESCRIPTION_LENGTH; finally
     * relativize the content and run it through WellFormedContentParser.
     *
     * @throws Exception if fetching or parsing fails.
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public void parse() throws Exception {

        if ( ! initialized ) {

            init();

        }

        int repassThreshold = getMinRepassContentLength();

        if ( anchorName == null ) {

            parseHTML( html );

            //fall back to more permissive passes while too little was found.
            if ( getContentStrippedLength() < repassThreshold ) {

                parseSecondary();

            }

            if ( getContentStrippedLength() < repassThreshold &&
                 INCLUDE_ACCEPTABLE_WITHIN_FIRSTLEVEL == false ) {

                parseAcceptableInnerElements();

            }

        } else {

            parseAnchorMode();

        }

        //After parsing, make sure we have a minimum description length.
        if ( description.length() < MIN_DESCRIPTION_LENGTH ) {

            PCDATASection[] sections = getPCDATASections();

            for ( int i = 0;
                  i < sections.length && description.length() < MIN_DESCRIPTION_LENGTH;
                  ++i ) {

                String candidate = sections[i].stripped;

                //skip missing text and text the description already contains.
                if ( candidate == null || description.indexOf( candidate ) != -1 ) {

                    continue;

                }

                //make sure the stripped content ends in a sentence terminator ( . ? !)
                description = description + " " + candidate;

            }

        }

        description = truncate( description, MAX_DESCRIPTION_LENGTH );

        //FIXME: what do we do if the title is < MIN_TITLE_WIDTH ?  Right now I
        //am handling this in the implementing parser..

        //relativize the content.
        String relativized = relativize( content.toString() );

        content = new StringBuffer( WellFormedContentParser.parse( relativized ) );

        debug( "Found the following number of PCDATA sections: " + this.vpcdata.size() );
        debug( "Content size: " + this.getContent().length() );
        debug( "Content Stripped Length: " + this.getContentStrippedLength() );
        debug( "Description size: " + this.description.length() );
        debug( "Min Repass Content Length: " + repassThreshold );
    }
        
    /**
     * Get the resource as an RSS 1.0 document (RDF root with one channel and
     * one item) whose item carries the parsed content in a mod_content
     * <code>content:encoded</code> element.
     *
     * Fallbacks when no title or description is set: the channel uses
     * "content for: " + resource and "Serialized content for the following
     * URL: " + resource, the item uses the bare resource for both.
     *
     * @return the serialized document, declared as ISO-8859-1.
     * @throws Exception if building or serializing the document fails.
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public String getRSS() throws Exception {

        //OK... use JDOM to serialize this to RSS 1.0

        Namespace content_ns = Namespace.getNamespace( "content", "http://purl.org/rss/1.0/modules/content/" );
        Namespace rdf_ns = Namespace.getNamespace( "rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#" );
        Namespace rss_ns = Namespace.getNamespace( "rss", "http://purl.org/rss/1.0/" );
        Namespace ag_ns = Namespace.getNamespace( "ag", "http://purl.org/rss/modules/aggregation/" );
        Namespace link_ns = Namespace.getNamespace( "link", "http://purl.org/rss/modules/link/" );

        //root element
        Element rdf = new Element( "RDF", rdf_ns );

        //channel element; the site link is used for both its about and link.
        Element channel = new Element( "channel", rss_ns );
        channel.setAttribute( "about", this.getSite(), rdf_ns );

        rdf.addContent( channel );

        //the item link has to include the anchor name, when there is one.
        String link = ( anchorName == null ) ? resource : resource + "#" + anchorName;

        //required elements of the channel
        Element channelTitle = new Element( "title", rss_ns );
        channelTitle.setText( title == null ? "content for: " + resource : title );
        channel.addContent( channelTitle );

        Element channelLink = new Element( "link", rss_ns );
        channelLink.setText( this.getSite() );
        channel.addContent( channelLink );

        Element channelDescription = new Element( "description", rss_ns );
        channelDescription.setText( description == null
                                    ? "Serialized content for the following URL: " + resource
                                    : description );
        channel.addContent( channelDescription );

        //the items element: rss:items / rdf:Seq / rdf:li
        Element li = new Element( "li", rdf_ns );
        li.setAttribute( "resource", resource, rdf_ns );

        Element seq = new Element( "Seq", rdf_ns );
        seq.addContent( li );

        Element items = new Element( "items" , rss_ns );
        items.addContent( seq );

        channel.addContent( items );

        //the single item of the channel
        Element item = new Element( "item", rss_ns );
        item.setAttribute( "about", resource, rdf_ns );

        rdf.addContent( item );

        Element itemTitle = new Element( "title", rss_ns );
        itemTitle.setText( title == null ? resource : title );
        item.addContent( itemTitle );

        Element itemLink = new Element( "link", rss_ns );
        itemLink.setText( link );
        item.addContent( itemLink );

        Element itemDescription = new Element( "description", rss_ns );
        itemDescription.setText( this.description == null ? resource : description );
        item.addContent( itemDescription );

        //FIXME: add the source moD_link

        //the parsed content itself
        Element encoded = new Element( "encoded", content_ns );
        encoded.setText( XMLStringCleanser.cleanse( content.toString() ) );

        item.addContent( encoded );

        //NOTE: I don't think it is a good idea to ALWAYS use ISO-8859-1 because
        //we might be fetching content that is UTF-8 or another charset.
        String encoding = "ISO-8859-1";

        XMLOutputter outputter = new XMLOutputter( "    ", true, encoding );

        //outputter.setTextNormalize( true );
        outputter.setEncoding( encoding );
        outputter.setOmitDeclaration( false );
        outputter.setOmitEncoding( false );

        Document document = new Document( rdf );

        return outputter.outputString( document );

    }

    /**
     * Walk over every opening element of the given HTML and collect the
     * sections worth keeping.
     *
     * The first <code>title</code> element seen while no title is set is
     * stripped, cleansed and stored as the title.  For every first level
     * element the PCDATA is fetched with <code>getPCDATA</code>.  A section
     * that is acceptable and not junk (or any acceptable section once non-junk
     * content has been found) is appended to <code>content</code>, wrapped in
     * its holder element, and its stripped text is added to the description
     * as long as the description is shorter than MIN_DESCRIPTION_LENGTH.
     *
     * Elements for which <code>getPCDATA</code> finds nothing are skipped
     * instead of raising a NullPointerException.
     *
     * @param html the HTML to scan
     * @throws Exception propagated from the regexp and cleansing helpers
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    private void parseHTML( String html ) throws Exception {

        RE firstr = new RE( "<([^/> ]+)", RE.MATCH_CASEINDEPENDENT  );

        int parseIndex = 0;

        while ( firstr.match( html, parseIndex ) ) {

            String local_name = firstr.getParen( 1 ).toLowerCase();

            //continue right behind the element name on the next pass
            parseIndex = firstr.getParenEnd( 0 );

            //handle title parsing...
            if ( local_name.equals( "title" ) && this.title == null ) {

                //the title text starts behind the '>' that ends the opening tag
                //(which may carry attributes) and runs up to the next "</"
                int open = html.indexOf( ">", parseIndex );
                int close = html.indexOf( "</", parseIndex );

                if ( open != -1 && close > open ) {

                    String stripped_title = strip( html.substring( open + 1, close ) );

                    //strip returns null when nothing but markup/whitespace is left
                    if ( stripped_title != null ) {

                        this.title = this.cleanseTitle( stripped_title );

                    }

                }

            }

            if ( isFirstLevelElement( local_name ) == false ) {
                continue;
            }

            PCDATASection result = getPCDATA( html, local_name, parseIndex );

            //getPCDATA returns null when the element is never terminated.
            //parseIndex already sits behind the element name so the scan
            //still moves forward.
            if ( result == null ) {
                continue;
            }

            //update the parseindex from the result.
            parseIndex = result.end;

            String pcdata = result.pcdata;
            String stripped = result.stripped;

            String holder_start = "<" + local_name + ">";
            String holder_end = "</" + local_name + ">";

            if ( isHolderElement( local_name ) == false ) {

                holder_start = "<p><" + local_name + ">";
                holder_end = "</" + local_name + "></p>";

            }

            //We should ONLY include content, when *stripped* it is > MIN_CDATA_LENGTH
            if ( this.isAcceptablePCDATA( result ) &&
                 ( this.isJunkContent( pcdata ) == false || foundNonJunkContent ) ) {

                debug( "ACCEPTED: " + local_name );

                this.foundNonJunkContent = true;

                this.content.append( "<!-- BEGIN PCDATA SECTION " );
                this.content.append( "begin: " + result.begin + " " );
                this.content.append( "end: " + result.end + " " );
                this.content.append( "pcdata-length: " + result.pcdata.length() + " " );
                this.content.append( "stripped-length: " + result.stripped.length() );
                this.content.append( " -->\n" );

                this.content.append( "\n" + holder_start+ "\n" );

                //FIXME: add the ability to 'fill' the pcdata to column 80
                this.content.append( pcdata );

                this.content.append( "\n" + holder_end + "\n" );

                //fix descriptions.  A description that was never set is
                //treated like an empty one.
                String current = ( this.description == null ) ? "" : this.description;

                if ( current.length() < MIN_DESCRIPTION_LENGTH ) {

                    this.description = current + " " + stripped;

                    debug( "New description is: " + this.description.length() );
                    debug( this.description );

                }

            } else if ( stripped != null && stripped.length() > 0 ) {

                debug( "REJECTED: The pcdata was rejected as it was not found acceptable. - " + stripped.length() + ": " );

                debug( "----" );

                debug( "PCDATA: " );
                debug( pcdata );

                debug( "STRIPPED: " );
                debug( stripped );

                debug( "----" );

            }

        }

    }

    /**
     * Get the PCDATA of the element that is currently being parsed.
     *
     * The given <code>local_name</code> is the local name of the current
     * element and <code>parseIndex</code> points into its opening tag.  The
     * section starts behind the '&gt;' that ends the opening tag and runs
     * until <code>foundAllPCDATA</code> reports that it is complete.  Note
     * that this is not really CDATA as HTML can include other markup in
     * paragraphs that we are going to want to include (img, span, div, b, i,
     * etc).
     *
     * When the terminating element is an acceptable inner element it is
     * included in the section, up to and including its closing '&gt;' (or the
     * end of the HTML if there is none).
     *
     * @param html the HTML being parsed
     * @param local_name lower case local name of the element being read
     * @param parseIndex index inside the opening tag of that element
     * @return the section as built by <code>section</code>, or null when the
     * opening tag is never closed or no terminating element is found
     * @throws Exception propagated from the regexp and cleansing helpers
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    private PCDATASection getPCDATA( String html,
                                     String local_name,
                                     int parseIndex ) throws Exception {

        //goto end of element decl
        int declEnd = html.indexOf( ">", parseIndex );

        if ( declEnd == -1 ) {
            return null;
        }

        int begin = declEnd + 1;

        parseIndex = begin;

        RE regexp = new RE( "</?([^/> ]+)([ ]?/>)?" );

        //current level of nesting.  everytime we go into an XML element we
        //increment it and when we leave an element we decrement it
        int nesting = 1; //we have to start at one because we are already
                         //within an element at this level.

        while ( regexp.match( html, parseIndex ) ) {

            parseIndex = regexp.getParenEnd( 0 );

            String tag = regexp.getParen( 0 );

            String current_local_name = regexp.getParen( 1 ).toLowerCase();

            //FIXME: should we only continue on acceptableInnerElements ?
            if ( tag.endsWith( "/>" ) ) {
                continue; //ignore self terminating elements.
            }

            //increment or decrement..
            if ( tag.startsWith( "</" ) ) {
                --nesting;
            } else {
                ++nesting;
            }

            //stop if this isn't an acceptable inner element... IE table, body,
            //things like that.
            //
            // or
            //
            // the current name is the local name </p> or it is a firstLevelElement <p>

            if ( foundAllPCDATA( local_name, current_local_name, nesting ) ) {

                debug( "local_name: " + local_name );
                debug( "current_local_name: " + current_local_name );

                //by default the section ends where the matched element begins
                int end = regexp.getParenStart( 0 );

                //though if it is acceptable... include this...
                if ( isAcceptableInnerElement( current_local_name ) ) {

                    //the regexp stops in front of the '>' so look it up
                    //explicitly; never step past the end of the HTML.
                    int close = html.indexOf( ">", regexp.getParenEnd( 0 ) );

                    end = ( close == -1 ) ? html.length() : close + 1;

                }

                String pcdata = cleansePCDATA( stripnbsp( html.substring( begin, end ) ) ).trim();

                return section( pcdata, begin, end );

            }

        }

        return null;

    }

    /**
     * Decide whether the element just matched ends the PCDATA section that is
     * being collected.
     *
     * @param local_name local name of the element whose PCDATA is collected
     * @param current_local_name local name of the element just matched
     * @param nesting current nesting depth; values &lt;= 0 mean we left the
     * element we started in
     * @return true if the section is complete
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    private boolean foundAllPCDATA( String local_name, String current_local_name, int nesting ) {

        //an element that may not live inside PCDATA always ends the section
        if ( ! isAcceptableInnerElement( current_local_name ) ) {
            return true;
        }

        //still nested inside the starting element, keep collecting
        if ( nesting > 0 ) {
            return false;
        }

        //back at the outer level: stop on our own element or on any first
        //level element
        return local_name.equals( current_local_name ) ||
               isFirstLevelElement( current_local_name );

    }
    
    /**
     * Build the PCDATASection for a piece of pcdata found between
     * <code>begin</code> and <code>end</code>.
     *
     * When stripping the pcdata leaves text behind, the section is also
     * recorded in <code>vpcdata</code>.  When only markup remains an empty
     * section (empty pcdata and stripped text) is returned and nothing is
     * recorded.
     *
     * @param pcdata the raw pcdata
     * @param begin index in the HTML where the pcdata begins
     * @param end index in the HTML where the pcdata ends
     * @return the new section, never null
     * @throws Exception propagated from <code>strip</code>
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    private PCDATASection section( String pcdata, int begin, int end ) throws Exception {

        String stripped = strip( pcdata );

        if ( stripped != null ) {

            debug( "SUCCESS on the following pcdata: " );
            debug( "begin: " + begin );
            debug( "end: " + end );
            debug( pcdata );

            debug( "SUCCESS - stripped: " );
            debug( stripped );

            PCDATASection found = new PCDATASection( pcdata, stripped, begin, end );

            //remember every section that carries real text
            this.vpcdata.addElement( found );

            return found;

        }

        //nothing but markup was left over, so do not hand back any pcdata
        debug( "FAILED" );
        debug( "begin: " + begin );
        debug( "end: " + end );
        debug( pcdata );

        return new PCDATASection( "", "", begin, end );

    }
    
    /**
     * Remove all elements from the given content and return the text that is
     * left.
     *
     * &amp;nbsp; entities are replaced with a space, every element is
     * replaced with a single space, and the result is normalized (runs of
     * spaces become one space, runs of \n become one \n) and trimmed.
     *
     * @param content the markup to strip
     * @return the remaining text, or null if nothing but markup was present
     * @throws Exception propagated from the regexp helpers
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public String strip( String content ) throws Exception {

        RE elements = new RE( "</?[^>]+>", RE.MATCH_CASEINDEPENDENT );

        //entities first, so they count as plain whitespace below
        String text = stripnbsp( content ).trim();

        //a space per element keeps neighbouring paragraphs from touching
        text = normalize( elements.subst( text, " " ).trim() );

        //only markup was present
        if ( text.equals( "" ) ) {
            return null;
        }

        return text;

    }

    /**
     * Collapse whitespace runs: consecutive spaces become a single space and
     * consecutive \n characters become a single \n.  The string is trimmed
     * after each of the two steps.
     *
     * @param content the text to normalize
     * @return the normalized text
     * @throws Exception propagated from the regexp library
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    private String normalize( String content ) throws Exception {

        RE spaces = new RE( "[ ]+", RE.MATCH_CASEINDEPENDENT );
        String collapsed = spaces.subst( content, " " ).trim();

        RE newlines = new RE( "[\n]+", RE.MATCH_CASEINDEPENDENT );

        return newlines.subst( collapsed, "\n" ).trim();

    }
    
    /**
     * Replace every <code>&amp;nbsp;</code> entity in the content with a
     * plain space.
     *
     * @param content the text to process
     * @return the text without &amp;nbsp; entities
     * @throws Exception propagated from the regexp library
     */
    private String stripnbsp( String content ) throws Exception {

        return new RE( "&nbsp;" ).subst( content, " " );

    }

    /**
     * Tell whether the given local name is registered in
     * <code>firstLevelElements</code> (paragraphs, etc).
     *
     * @param local_name the element name to look up
     * @return true if the name is a first level element
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    private boolean isFirstLevelElement( String local_name ) {

        boolean registered = this.firstLevelElements.contains( local_name );

        return registered;

    }

    /**
     * Tell whether the given local name is registered in
     * <code>acceptableInnerElements</code>, i.e. may appear inside a PCDATA
     * section.
     *
     * @param current_local_name the element name to look up
     * @return true if the name is an acceptable inner element
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    private boolean isAcceptableInnerElement( String current_local_name ) {

        boolean registered = this.acceptableInnerElements.contains( current_local_name );

        return registered;

    }

    /**
     * Register the given element name as an acceptable inner element.  When
     * INCLUDE_ACCEPTABLE_WITHIN_FIRSTLEVEL is set the name is registered as a
     * first level element as well.
     *
     * @param name the element name to register
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    private void addAcceptableInnerElement( String name ) {

        acceptableInnerElements.addElement( name );

        if ( ! INCLUDE_ACCEPTABLE_WITHIN_FIRSTLEVEL ) {
            return;
        }

        firstLevelElements.addElement( name );

    }
        
    /**
     * Expand the links of all <code>img</code> and <code>a</code> elements in
     * the given HTML content.
     *
     * For every such element the first quoted <code>src</code> or
     * <code>href</code> attribute inside the element is passed through
     * <code>expand</code>.  Everything else, including img/a elements that
     * carry no such attribute, is copied to the result unchanged.
     *
     * @param content the HTML content to process
     * @return the content with expanded links
     * @throws Exception propagated from the regexp library or from
     * <code>expand</code>
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public String relativize( String content ) throws Exception {

        //FIXME: if the URL contains a scheme, don't include it.
        //
        // Example:  http://www.cnn.com/javascript:LaunchVideo('/popup/section_framesets/showbiz.frameset.','300k')

        StringBuffer buff = new StringBuffer();

        //everything in front of this index has been copied to buff
        int index = 0;

        RE elementr = new RE( "<(img|a)[^>]*>", RE.MATCH_CASEINDEPENDENT | RE.MATCH_MULTILINE );

        RE attributer = new RE( "(src|href)=[\"']([^\"']+)[\"'][^>]*/?>", RE.MATCH_CASEINDEPENDENT | RE.MATCH_MULTILINE  );

        while ( elementr.match( content, index ) ) {

            int elementStart = elementr.getParenStart( 0 );
            int elementEnd = elementr.getParenEnd( 0 );

            //only accept an attribute that lives inside the matched element;
            //a match further down belongs to some other element.
            if ( attributer.match( content, elementStart ) &&
                 attributer.getParenStart( 2 ) < elementEnd ) {

                //copy everything up to the link, then the expanded link...
                buff.append( content.substring( index, attributer.getParenStart( 2 ) ) );

                buff.append( expand( attributer.getParen( 2 ) ) );

                //...and complete the element
                buff.append( content.substring( attributer.getParenEnd( 2 ),
                                                attributer.getParenEnd( 0 ) ) );

                index = attributer.getParenEnd( 0 );

            } else {

                //nothing to expand: copy the element as-is so that no content
                //is lost when we move past it.
                buff.append( content.substring( index, elementEnd ) );

                index = elementEnd;

            }

        }

        //add the rest of the content
        buff.append( content.substring( index ) );

        return buff.toString();

    }

    /**
     * Print the command line syntax to standard out.  The class named in the
     * message is the one <code>main</code> instantiates.
     *
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    private static void syntax() {

        System.out.println( "SYNTAX: org.openprivacy.reptile.RSSContentSerializer URL" );

    }

    /**
     * Command line entry point.  Expects exactly one argument, the resource
     * to serialize; parses it and prints the resulting RSS to standard out.
     * With any other number of arguments the syntax is printed instead.
     *
     * @param args command line arguments
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public static void main( String[] args ) {

        if ( args.length != 1 ) {

            syntax();
            return;

        }

        RSSContentSerializer serializer = new RSSContentSerializer();
        serializer.setResource( args[0] );

        try {

            serializer.parse();

            System.out.println( serializer.getRSS() );

        } catch ( Exception e ) {

            e.printStackTrace();

        }

    }

    /**
     * Return a snapshot of all PCDATA sections recorded in
     * <code>vpcdata</code>.
     *
     * @return a new array holding the recorded sections
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public PCDATASection[] getPCDATASections() {

        PCDATASection[] sections = new PCDATASection[ this.vpcdata.size() ];

        this.vpcdata.copyInto( sections );

        return sections;

    }

    /**
     * Write the message to standard error, regardless of the DEBUG setting.
     *
     * @param message the text to print
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    private void debugAlways( String message ) {

        java.io.PrintStream err = System.err;

        err.println( message );

    }

    /**
     * Write the message to standard error, but only when DEBUG is enabled.
     *
     * @param message the text to print
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    private void debug( String message ) {

        if ( ! DEBUG ) {
            return;
        }

        System.err.println( message );

    }

    /**
     * Return everything collected in <code>content</code> so far as a
     * string.
     *
     * @return the collected content
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public String getContent() {

        String collected = this.content.toString();

        return collected;

    }
    
    /**
     * Get the base of the resource URL, i.e. everything in front of its last
     * slash.  For example if the resource is:
     *
     * http://www.foo.com/directory/index.html
     *
     * we will return
     *
     * http://www.foo.com/directory
     *
     * When the last slash sits at index 8 or lower (or there is none), as in
     * http://www.cnn.com, the whole resource is returned.
     *
     * @return the base of the resource
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public String getBase() {

        //slashes up to this index are taken to belong to the scheme separator
        int schemeLimit = "http://".length() + 1;

        int lastSlash = resource.lastIndexOf( "/" );

        //also covers -1, i.e. no slash at all
        boolean noPath = lastSlash <= schemeLimit;

        int end = noPath ? resource.length() : lastSlash;

        return resource.substring( 0, end );

    }

    /**
     * Get the site for this resource.  For example for:
     *
     * http://www.foo.com/directory/index.html
     *
     * we will return
     *
     * http://www.foo.com
     *
     * The value is computed on the first call and kept in <code>site</code>
     * for later calls.
     *
     * @return the resource up to (excluding) the first slash found at index 8
     * or later, or the whole resource if there is no such slash
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public String getSite() {

        if ( site != null ) {
            return site;
        }

        //start searching at index 8 to get past the slashes of the scheme
        int slash = resource.indexOf( "/", 8 );

        int end = ( slash == -1 ) ? resource.length() : slash;

        site = resource.substring( 0, end );

        return site;

    }

    /**
     * Expand a link relative to the current resource.  This takes care of
     * links such as
     *
     * /foo.html -> http://site.com/foo.html
     *
     * foo.html -> http://site.com/base/foo.html
     *
     * #anchor -> the resource followed by #anchor
     *
     * ../foo.html -> foo.html in the parent directory of the resource
     *
     * Links that already carry a scheme (http:, https:, mailto:,
     * javascript:, ...) are returned untouched.
     *
     * Links should *always* be expanded before they are used.
     *
     * Note that all resource URLs will have correct trailing slashes.  If the URL
     * does not end with / then it is a file URL and not a directory.
     *
     * @param link the link as found in the document
     * @return the expanded link
     * @throws Exception propagated from the regexp library
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */

    public String expand( String link ) throws Exception {

        if ( link.startsWith( "/" ) ) {

            return this.getSite() + link;

        }

        if ( link.startsWith( "#" ) ) {

            return this.resource + link;

        }

        if ( link.startsWith( ".." ) ) {

            //ok.  We need to get rid of these .. directories.

            RE regexp = new RE ( "^\\.\\./?(.*)$", RE.MATCH_CASEINDEPENDENT );

            RE baseRegexp = new RE ( "^(.*/)[^/]+/$", RE.MATCH_CASEINDEPENDENT );

            String site = this.getSite();

            //resolve against the directory of the resource.  A resource that
            //does not end with / is a file, so drop the file name first.
            String base = resource.endsWith( "/" ) ? resource : this.getBase() + "/";

            while ( regexp.match( link ) ) {

                //get rid of the first previous dir in the link
                link = regexp.getParen( 1 );

                //get rid of the last directory in the base, but never climb
                //above the root of the site (that would eat the host name)
                if ( base.length() > site.length() + 1 && baseRegexp.match( base ) ) {

                    base = baseRegexp.getParen( 1 );

                }

            }

            return base + link;

        }

        //a link that starts with "scheme:" is absolute already.  A scheme is
        //a letter followed by letters, digits, '+', '-' or '.'.
        int colon = link.indexOf( ':' );

        boolean absolute = colon > 0;

        for ( int i = 0; absolute && i < colon; ++i ) {

            char c = link.charAt( i );

            boolean letter = ( c >= 'a' && c <= 'z' ) || ( c >= 'A' && c <= 'Z' );
            boolean other = ( c >= '0' && c <= '9' ) || c == '+' || c == '-' || c == '.';

            absolute = letter || ( i > 0 && other );

        }

        if ( absolute ) {

            return link;

        }

        return this.getBase() + "/" + link;

    }

    /**
     * Return true if this is junk content, for example if it consists almost
     * only of anchor links.  This works very similar to isAcceptablePCDATA
     * but is <b>much</b> more picky and tries to avoid false positives at all
     * costs: reporting a valid pcdata section as junk means it would not be
     * included, which would be a bad thing.
     *
     * The content is normalized, every element matched by the element pattern
     * whose name is not "a" is removed, newlines are dropped and "&gt; &lt;"
     * is collapsed.  Then the characters covered by anchors (plus up to five
     * characters following each anchor) are counted and compared, as a
     * percentage of the remaining content, with MIN_JUNK_DATA_PERCENTAGE.
     *
     * @param content the pcdata to inspect
     * @return true if the anchor percentage is above MIN_JUNK_DATA_PERCENTAGE
     * @throws Exception propagated from the regexp library
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public boolean isJunkContent( String content ) throws Exception {

        content = normalize( content );

        //step 1: drop every element except anchors

        RE element = new RE( "</?([a-zA-Z]+)[^>]+>", RE.MATCH_CASEINDEPENDENT | RE.MATCH_MULTILINE );

        int cursor = 0;

        while ( element.match( content, cursor ) ) {

            int start = element.getParenStart( 0 );
            int stop = element.getParenEnd( 0 );

            if ( element.getParen( 1 ).toLowerCase().equals( "a" ) ) {

                //anchors stay, continue behind this one
                cursor = stop;
                continue;

            }

            //cut the element out and look again from where it started
            content = content.substring( 0, start ).trim() +
                      content.substring( stop ).trim();

            cursor = start;

        }

        //step 2: make sure elements are back-to-back

        content = new RE( "\n" ).subst( content, "" );

        content = new RE( "> <" ).subst( content, "><" );

        //step 3: measure how much of the rest is anchors, each with an
        //optional 5 characters after it

        RE anchor = new RE( "(<a [^>]+>[^<]+</a>)[^<]?[^<]?[^<]?[^<]?[^<]?", RE.MATCH_CASEINDEPENDENT | RE.MATCH_MULTILINE );

        int matchCount = 0; //total number of characters matched.

        debug( "CONTENT: " );

        debug( content );

        cursor = 0;

        while ( anchor.match( content, cursor ) ) {

            debug( "match was: " + anchor.getParen( 0 ) );

            matchCount += anchor.getParenEnd( 0 ) - anchor.getParenStart( 0 );

            //continue right behind the anchor itself
            cursor = anchor.getParenEnd( 1 );

        }

        double percentage =  ( (double)matchCount / content.length() ) * 100;

        debug( "percentage: " + percentage );

        return percentage > MIN_JUNK_DATA_PERCENTAGE;

    }

    /**
     * Report the mode this serializer is operating in.
     *
     * @return the current value of <code>mode</code>
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public int getMode() {

        int current = this.mode;

        return current;

    }

    /**
     * Switch this serializer to minimal mode by setting <code>mode</code> to
     * MODE_MINIMAL.
     *
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public void setModeMinimal() {

        mode = MODE_MINIMAL;

    }
    
    /**
     * Cleanse HTML of junk: comments, script blocks and style blocks are
     * removed with <code>delete</code>.
     *
     * The script and style patterns accept any mix of upper and lower case
     * so that &lt;SCRIPT&gt; and &lt;STYLE&gt; blocks are removed as well.
     *
     * @param html the HTML to cleanse
     * @return the HTML without the removed regions
     * @throws Exception propagated from <code>delete</code>
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public String cleanseHTML( String html ) throws Exception {

        html = delete( "<!--", "--[ ]?/?>", html );

        //delete compiles its patterns case sensitive, so spell out both cases
        String script = "[sS][cC][rR][iI][pP][tT]";
        String style = "[sS][tT][yY][lL][eE]";

        //FIXME: this will break on <script/>
        html = delete( "<" + script, "</" + script + ">", html );
        html = delete( "<" + style, "</" + style + ">", html );
        html = delete( "<" + style, "/>", html );

        return html;

    }

    /**
     * Delete every region that starts with a match of the first regexp and
     * ends with the next match of the second regexp, and return what is left.
     *
     * If a start is found that is never followed by an end, the text is cut
     * off right behind that start match and the search stops.
     *
     * @param begin_regexp regexp marking the start of a region
     * @param end_regexp regexp marking the end of a region
     * @param pcdata the text to process
     * @return the text without the deleted regions
     * @throws Exception propagated from the regexp library
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public String delete( String begin_regexp,
                          String end_regexp,
                          String pcdata ) throws Exception {

        RE opening = new RE( begin_regexp );
        RE closing = new RE( end_regexp );

        int cursor = 0;

        while ( opening.match( pcdata, cursor ) ) {

            int regionStart = opening.getParenStart( 0 );

            //after a deletion the search resumes where the region started
            cursor = regionStart;

            if ( ! closing.match( pcdata, regionStart ) ) {

                //unterminated region: keep the start match, drop the rest
                return pcdata.substring( 0, opening.getParenEnd( 0 ) );

            }

            pcdata = pcdata.substring( 0, regionStart ) +
                     pcdata.substring( closing.getParenEnd( 0 ) );

        }

        return pcdata;

    }
    
    /**
     * Clean up PCDATA so it is better for RSS.  Currently this removes all
     * opening and closing <code>font</code> elements (in any case); the text
     * between them is kept.
     *
     * @param pcdata the pcdata to clean
     * @return the pcdata without font elements
     * @throws Exception propagated from the regexp library
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public String cleansePCDATA( String pcdata ) throws Exception {

        RE fonts = new RE( "</?font[^>]*>", RE.MATCH_CASEINDEPENDENT );

        return fonts.subst( pcdata, "" );

    }

    /**
     * Add up the length of the stripped text of all recorded PCDATA sections.
     * Sections without stripped text do not count.
     *
     * @return the total stripped length
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public int getContentStrippedLength() {

        //FIXME: this is throwing an NPE here!!!

        PCDATASection[] sections = getPCDATASections();

        int total = 0;

        for ( int i = 0; i < sections.length; ++i ) {

            String text = sections[i].stripped;

            if ( text == null ) {
                continue;
            }

            total += text.length();

        }

        return total;

    }

    /**
     * Return true if this is an acceptable PCDATASection, which is the case
     * when it has stripped text longer than MIN_CDATA_LENGTH.
     *
     * @param section the section to check
     * @return true if the section can be used
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public boolean isAcceptablePCDATA( PCDATASection section ) {

        String text = section.stripped;

        if ( text == null ) {
            return false;
        }

        return text.length() > MIN_CDATA_LENGTH;

    }

    /**
     * Return true if the given local_name is a holder that can format HTML
     * across a paragraph.  The holders are "p" and "br".
     *
     * @param local_name the element name to check
     * @return true for "p" and "br", false for anything else
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public boolean isHolderElement( String local_name ) {

        if ( local_name.equals( "p" ) ) {
            return true;
        }

        return local_name.equals( "br" );

    }
    
    /**
     * Set the <code>initialized</code> flag.
     *
     * @param initialized the new value of the flag
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public void setInitialized( boolean initialized ) {

        boolean flag = initialized;

        this.initialized = flag;

    }

    /**
     * Get the <code>initialized</code> flag.
     *
     * @return the current value of the flag
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public boolean getInitialized() {

        boolean flag = this.initialized;

        return flag;

    }

    /**
     * Truncate the given value so that at most <code>length</code> of its
     * characters remain, followed by "...".  Values shorter than
     * <code>length</code> are returned unchanged; a value of exactly
     * <code>length</code> characters gets the "..." appended as well.
     *
     * @param value the text to truncate
     * @param length the number of characters to keep
     * @return the (possibly) truncated text
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public String truncate( String value, int length ) {

        //short enough, nothing to do
        if ( value.length() < length ) {
            return value;
        }

        //FIXME: to go end of word break!  no "hel..." instead do "hello ..."
        return value.substring( 0, length ) + "...";

    }

    /**
     * Attempt to pull out the title from the given description.
     *
     * The title runs up to the end of a sentence, i.e. one of . ? ! :
     * followed by a space.  Sentences are added while the title is shorter
     * than MIN_TITLE_WIDTH, but never so that it grows beyond MAX_TITLE_WIDTH.
     * When no usable sentence end is found, the first MAX_TITLE_WIDTH
     * characters (or the whole description if it is shorter than that) are
     * used.  The result is passed through <code>truncate</code> and
     * <code>cleanseTitle</code>.
     *
     * @param description the description to take the title from
     * @return the guessed title
     * @throws Exception propagated from the regexp library and
     * <code>cleanseTitle</code>
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public String getTitle( String description ) throws Exception {

        //FIXME: what about sentences with abbreviations?

        //FIXME: the following description will fail.

        // WASHINGTON -- The five New York men accused of supporting terrorism
        //trained at a camp in Afghanistan run by Usama bin Laden's Al Qaeda
        //network, according to U.S. authorities.

        //we can't include ; because I don't want to match against &amp;
        RE regexp = new RE( "[.?!:] " );

        int end = 0;

        while ( end < MIN_TITLE_WIDTH && regexp.match( description, end ) ) {

            int candidate = regexp.getParenEnd( 0 );

            if ( candidate > MAX_TITLE_WIDTH ) {
                break;
            }

            end = candidate;

        }

        //if the title STILL can't be guessed try to use a substring.  The
        //description may well be shorter than the maximum width.
        if ( end == 0 ) {

            end = Math.min( MAX_TITLE_WIDTH, description.length() );

        }

        String title = description.substring( 0, end );

        title = truncate( title, MAX_TITLE_WIDTH );

        return cleanseTitle( title );

    }

    /**
     * Get the minimum amount of content we need before another repass, i.e.
     * before a second pass with td, br elements is done.
     *
     * The value is 40% of the length of <code>html</code> after all elements
     * have been removed and spaces, tabs and newlines have been collapsed.
     * As a side effect the collapsed text is written to the file "out.log" in
     * the current directory.
     *
     * @return the minimum content length
     * @throws Exception propagated from the regexp library, or if "out.log"
     * cannot be written
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public int getMinRepassContentLength() throws Exception {

        String ripped = this.html;

        RE regexp = new RE( "<[^>]+>", RE.MATCH_CASEINDEPENDENT  );
        ripped = regexp.subst( ripped, "" );

        //now normalize spaces...
        regexp = new RE( "[ ]+", RE.MATCH_CASEINDEPENDENT );
        ripped = regexp.subst( ripped, " " ).trim();

        //tabs
        regexp = new RE( "[\t]+", RE.MATCH_CASEINDEPENDENT );
        ripped =  regexp.subst( ripped, " " ).trim();

        //blank lines to \n
        regexp = new RE( "^[ ]$",RE.MATCH_MULTILINE );
        ripped =  regexp.subst( ripped, "" );

        //now normalize \n
        regexp = new RE( "[\n]+", RE.MATCH_CASEINDEPENDENT | RE.MATCH_MULTILINE );
        ripped =  regexp.subst( ripped, "\n" ).trim();

        //dump the text for inspection; always release the file handle again
        FileOutputStream out = new FileOutputStream( "out.log" );

        try {

            out.write( ripped.getBytes() );

        } finally {

            out.close();

        }

        //a little less than 1/2
        return (int)(ripped.length() * .4);

    }

    /**
     * Hook for removing unwanted entities from the data.  No entity is
     * removed at the moment; the data is handed back as given.
     *
     * @param data the text to cleanse
     * @return the data, unchanged
     * @throws Exception declared for callers; nothing is thrown at the moment
     */
    public String cleanseEntities( String data ) throws Exception {

        //an earlier experiment removed &raquo; here:
        //
        //    RE regexp = new RE( "&raquo;" );
        //    data = regexp.subst( data, "" );

        return data;

    }
    
    /**
     * Cleanse leading garbage from the data.  The regexp that was meant to
     * remove leading non-alpha-numeric chars is disabled, so all this does
     * right now is trim the data.
     *
     * @param data the text to cleanse
     * @return the trimmed data
     * @throws Exception declared for callers; nothing is thrown at the moment
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    private String cleanseLeadingGarbage( String data ) throws Exception {

        //we can't have & or ;

        //FIXME: does this regexp not work right... is it too inclusive?
        //
        //    RE regexp = new RE( "^[!#%()<>@-]+" );
        //    data = regexp.subst( data, "" );

        String trimmed = data.trim();

        return trimmed;

    }
    
    /**
     * Cleanse a leading date from the data.  A year of the form 200x or 201x
     * that is preceded by at most 15 characters at the very start of the data
     * is removed together with those characters; the result is trimmed.
     *
     * @param data the text to cleanse
     * @return the data without the leading date
     * @throws Exception propagated from the regexp library
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    private String cleanseDate( String data ) throws Exception {

        //up to 15 arbitrary characters followed by the year
        RE leadingDate = new RE( "^.?.?.?.?.?.?.?.?.?.?.?.?.?.?.?20[01][0-9]" );

        return leadingDate.subst( data, "" ).trim();

    }
    
    /**
     * Cleanse a title so that it can be represented correctly.  The title is
     * run through <code>cleanseDate</code>, <code>cleanseLeadingGarbage</code>
     * and <code>cleanseEntities</code>, in that order.
     *
     * @param title the raw title
     * @return the cleansed title
     * @throws Exception propagated from the cleansing steps
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public String cleanseTitle( String title ) throws Exception {

        String undated = cleanseDate( title );

        String trimmed = cleanseLeadingGarbage( undated );

        return cleanseEntities( trimmed );

    }
    
    /**
     * One PCDATA section found in the HTML: the pcdata itself, its stripped
     * form, and the position in the HTML where it was found.
     *
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    class PCDATASection {

        /** PCData as a string. */
        public String pcdata;

        /** Stripped PCData. */
        public String stripped;

        /** Beginning of the HTML where we found this data. */
        public int begin;

        /** End of the HTML where we found this data. */
        public int end;

        /**
         * Create a new <code>PCDATASection</code> instance.
         *
         * @param pcdata the pcdata as a string
         * @param stripped the stripped pcdata
         * @param begin index in the HTML where the data begins
         * @param end index in the HTML where the data ends
         * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
         */
        public PCDATASection( String pcdata,
                              String stripped,
                              int begin,
                              int end ) {

            this.begin = begin;
            this.end = end;
            this.pcdata = pcdata;
            this.stripped = stripped;

        }

    }

}
