/*
 * Decompiled with CFR 0.152.
 */
package org.openprivacy.reptile;

import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.Enumeration;
import java.util.Vector;
import org.apache.regexp.RE;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.Namespace;
import org.jdom.output.XMLOutputter;
import org.openprivacy.reptile.WellFormedContentParser;
import org.openprivacy.reptile.xml.XMLStringCleanser;
import talon.util.InputStreamUtils;

public class RSSContentSerializer {
    /** Version stamp of this component. NOTE(review): looks like a build timestamp - confirm. */
    public static final String COMPONENT_VERSION = "1032684403";
    /** Value sent as the User-Agent request header when the resource is fetched. */
    public static final String USER_AGENT_STRING = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.1) Gecko/20020826";
    /** Presumably the minimum length a derived title should reach (see getTitle(String)) - confirm. */
    public static final int MIN_TITLE_WIDTH = 20;
    /** Presumably the maximum length of a derived title (see getTitle(String)) - confirm. */
    public static final int MAX_TITLE_WIDTH = 200;
    /** Presumably the link-text percentage above which isJunkContent reports junk - confirm. */
    public static final int MIN_JUNK_DATA_PERCENTAGE = 80;
    // NOTE(review): no visible code reads this flag.
    public static final boolean INCLUDE_ACCEPTABLE_WITHIN_FIRSTLEVEL = false;
    // Parse modes. Only the value 0 is ever assigned to the mode field in this file.
    public static final int MODE_MINIMAL = 0;
    public static final int MODE_FLEXIBLE = 1;
    public static final int MODE_ANCHOR = 2;
    /** Presumably the description length parse() tries to reach - confirm. */
    public static final int MIN_DESCRIPTION_LENGTH = 500;
    /** Presumably the length at which parse() truncates the description - confirm. */
    public static final int MAX_DESCRIPTION_LENGTH = 900;
    /** Presumably the stripped-text length a section must exceed to be acceptable - confirm. */
    public static final int MIN_CDATA_LENGTH = 50;
    /** Debug switch; debug(String) has an empty body in this build. */
    public static final boolean DEBUG = false;
    // NOTE(review): no visible code reads this flag.
    public static final boolean INCLUDE_FORMS = false;
    // Scheme-and-host prefix of the resource; computed lazily and cached by getSite().
    private String site = null;
    // Page markup; set by init() (after cleanseHTML) or by setHTML().
    private String html = null;
    // Title; taken from the first <title> element seen by parseHTML unless set explicitly.
    private String title = null;
    // Plain-text description accumulated from accepted sections during parsing.
    private String description = "";
    // PCDATASection objects recorded by section() and parseAnchorMode().
    private Vector vpcdata = new Vector();
    // URL of the page, without any "#anchor" suffix (see setResource).
    private String resource = null;
    // Accumulated output markup for the content:encoded element.
    private StringBuffer content = new StringBuffer();
    // Lower-case names of elements whose content parseHTML tries to extract.
    private Vector firstLevelElements = new Vector();
    // Lower-case names of elements that may appear inside an extracted section.
    private Vector acceptableInnerElements = new Vector();
    // Set once parseHTML has accepted a section; later sections then skip the junk test.
    private boolean foundNonJunkContent = false;
    private int mode = 0;
    // Fragment part of the resource URL, or null when there is none.
    private String anchorName = null;
    // True once the markup has been loaded; parse() calls init() when this is false.
    private boolean initialized = false;

    /**
     * Creates a serializer with the default element whitelists.
     *
     * <p>First-level elements are those whose content {@code parseHTML} tries to extract;
     * acceptable inner elements are those allowed to appear inside an extracted section.
     * Names are stored in lower case without surrounding whitespace, because they are compared
     * with tag names captured by a regular expression that excludes spaces.
     */
    public RSSContentSerializer() {
        String[] firstLevel = {
            "p", "pre", "h1", "h2", "h3", "h4", "blockquote", "ul", "ol", "dl", "b", "strong", "span"
        };
        for (int i = 0; i < firstLevel.length; ++i) {
            this.firstLevelElements.addElement(firstLevel[i]);
        }

        // "quote" and "strike" used to be registered with a trailing space and so could never
        // match a parsed tag name; the duplicate "br" entry has been dropped.
        String[] inner = {
            "b", "img", "a", "i", "font", "blockquote", "span", "div", "em", "br",
            "ul", "ol", "li", "pre", "dl", "dd", "dt", "code", "ins", "del",
            "q", "quote", "strong", "abbr", "acronym", "cite", "samp", "sub", "sup", "u",
            "nitf", "xmp", "var", "kbd", "dfn", "big", "tt", "strike", "s",
            "p", "h1", "h2", "h3", "h4", "nobr", "wbr", "address", "fieldset", "legend"
        };
        for (int i = 0; i < inner.length; ++i) {
            this.addAcceptableInnerElement(inner[i]);
        }
    }

    /**
     * Returns the markup currently held by this serializer.
     *
     * @return the HTML string, or {@code null} if none has been loaded or set
     */
    public String getHTML() {
        return html;
    }

    /**
     * Replaces the markup that later parsing will operate on.
     *
     * @param html the HTML to parse
     */
    public void setHTML(String html) {
        this.html = html;
    }

    /**
     * Fetches the configured resource and returns its body as a string.
     *
     * <p>The request carries {@link #USER_AGENT_STRING} as its User-Agent. After connecting,
     * {@code resource} is replaced by the URL the connection reports, which may differ from
     * the requested one.
     *
     * @return the response body
     * @throws Exception if the URL is malformed, the connection fails, or the reported content
     *         type does not contain {@code text/html}
     */
    public String getResourceAsString() throws Exception {
        URL url = new URL(this.resource);
        URLConnection conn = url.openConnection();
        conn.setRequestProperty("User-Agent", USER_AGENT_STRING);
        String contentType = conn.getContentType();
        if (contentType != null && contentType.indexOf("text/html") == -1) {
            throw new Exception("Only HTML content is supported and the following content type was detected: " + contentType);
        }
        InputStream is = conn.getInputStream();
        try {
            this.resource = conn.getURL().toExternalForm();
            return InputStreamUtils.toString(is);
        } finally {
            // The stream was previously never closed, leaking the underlying connection.
            is.close();
        }
    }

    /**
     * Returns the title set explicitly or found while parsing.
     *
     * @return the title, or {@code null} if none is known yet
     */
    public String getTitle() {
        return title;
    }

    /**
     * Sets the title. A non-null title stops {@code parseHTML} from taking one from the page.
     *
     * @param title the title to use
     */
    public void setTitle(String title) {
        this.title = title;
    }

    /**
     * Returns the description after passing it through the date, leading-garbage and entity
     * cleansers. The cleansed value is stored back, so each call re-cleanses the stored text.
     *
     * @return the cleansed description, or the stored value unchanged if cleansing failed
     */
    public String getDescription() {
        try {
            String withoutDate = this.cleanseDate(this.description);
            String withoutGarbage = this.cleanseLeadingGarbage(withoutDate);
            this.description = this.cleanseEntities(withoutGarbage);
        }
        catch (Exception ignored) {
            // Best effort: if cleansing fails the stored description is returned as it is.
        }
        return this.description;
    }

    /**
     * Sets the description text.
     *
     * @param description the description to use
     */
    public void setDescription(String description) {
        this.description = description;
    }

    /**
     * Returns the resource URL, without any anchor suffix.
     *
     * @return the resource URL, or {@code null} if none has been set
     */
    public String getResource() {
        return resource;
    }

    /**
     * Sets the URL to serialize. A {@code #fragment} suffix is split off and remembered as the
     * anchor name, which makes {@code parse()} use anchor mode.
     *
     * <p>State derived from an earlier resource (the anchor name and the cached site prefix) is
     * cleared so that a reused instance does not carry it over to the new URL.
     *
     * @param resource the URL, optionally ending in {@code #anchor}; must not be {@code null}
     */
    public void setResource(String resource) {
        int hash = resource.indexOf("#");
        // getSite() caches its result; drop it so it is recomputed for the new URL.
        this.site = null;
        if (hash != -1) {
            this.anchorName = resource.substring(hash + 1);
            this.resource = resource.substring(0, hash);
            this.debug("anchorName: " + this.anchorName);
            this.debug("resource: " + this.resource);
        } else {
            // Previously an anchor from an earlier call stayed in effect here.
            this.anchorName = null;
            this.resource = resource;
        }
    }

    /**
     * Discards the results of an earlier parse pass. The first-level element list is emptied
     * too, so the caller must add the elements it wants before parsing again.
     */
    private void resetParse() {
        this.foundNonJunkContent = false;
        this.description = "";
        this.content = new StringBuffer();
        this.vpcdata = new Vector();
        this.firstLevelElements = new Vector();
    }

    /**
     * Second parse pass: starts over with only {@code td}, {@code br} and {@code div} as
     * first-level elements.
     *
     * @throws Exception if parsing fails
     */
    private void parseSecondary() throws Exception {
        final String rule = "=================================================================";
        this.debug(rule);
        this.debug("RESORTING TO SECONDARY PARSE");
        this.debug(rule);
        this.resetParse();
        String[] fallback = {"td", "br", "div"};
        for (int i = 0; i < fallback.length; ++i) {
            this.firstLevelElements.addElement(fallback[i]);
        }
        this.parseHTML(this.html);
    }

    /**
     * Third parse pass: starts over with every acceptable inner element promoted to a
     * first-level element.
     *
     * @throws Exception if parsing fails
     */
    private void parseAcceptableInnerElements() throws Exception {
        final String rule = "=================================================================";
        this.debug(rule);
        this.debug("RESORTING TO acceptableInnerElements PARSE");
        this.debug(rule);
        this.resetParse();
        for (int i = 0; i < this.acceptableInnerElements.size(); ++i) {
            this.firstLevelElements.addElement(this.acceptableInnerElements.elementAt(i));
        }
        this.parseHTML(this.html);
    }

    /**
     * Extracts the content that follows the {@code <a name="...">} element named by the
     * anchor of the resource URL.
     *
     * <p>The section starts at the anchor element. It ends at the next {@code <a name=}
     * element; if there is none, at the first later tag that is neither an acceptable inner
     * element nor a first-level element; if there is none of those either, at the end of the
     * document.
     *
     * @throws Exception if the anchor cannot be found in the markup
     */
    private void parseAnchorMode() throws Exception {
        this.debug("=================================================================");
        this.debug("ANCHOR MODE PARSE");
        this.debug("=================================================================");
        // The anchor name comes from the URL; escape it so characters such as '+' or '(' are
        // matched literally instead of being read as regular-expression syntax.
        RE anchorRe = new RE("<a name=\"" + RSSContentSerializer.escapeRegexp(this.anchorName) + "\"", 3);
        if (!anchorRe.match(this.html)) {
            throw new Exception("Anchor not found: " + this.anchorName);
        }
        int begin = anchorRe.getParenStart(0);
        int afterAnchor = anchorRe.getParenEnd(0);

        int end = -1;
        RE nextAnchorRe = new RE("<a name=", 1);
        if (nextAnchorRe.match(this.html, afterAnchor)) {
            end = nextAnchorRe.getParenStart(0);
        } else {
            RE tagRe = new RE("<([^/> ]+)", 1);
            int parseIndex = afterAnchor;
            while (tagRe.match(this.html, parseIndex)) {
                parseIndex = tagRe.getParenEnd(0);
                String local_name = tagRe.getParen(1).toLowerCase();
                if (!this.isAcceptableInnerElement(local_name) && !this.isFirstLevelElement(local_name)) {
                    end = tagRe.getParenStart(0);
                    break;
                }
            }
        }
        if (end == -1) {
            // No terminating tag: take everything up to the end of the document. Previously
            // this case reached substring(begin, -1) and threw.
            end = this.html.length();
        }

        String pcdata = this.cleansePCDATA(this.stripnbsp(this.html.substring(begin, end)));
        String stripped = this.strip(pcdata);
        this.debug("SUCCESS:");
        this.debug(pcdata);
        this.vpcdata.addElement(new PCDATASection(pcdata, stripped, begin, end));
        this.content.append(pcdata);
    }

    /** Backslash-escapes regular-expression metacharacters so {@code literal} matches itself. */
    private static String escapeRegexp(String literal) {
        StringBuffer escaped = new StringBuffer(literal.length() + 8);
        for (int i = 0; i < literal.length(); ++i) {
            char c = literal.charAt(i);
            if ("\\[](){}.*+?^$|".indexOf(c) != -1) {
                escaped.append('\\');
            }
            escaped.append(c);
        }
        return escaped.toString();
    }

    /**
     * Loads the resource, removes comments, scripts and styles from it, and marks this
     * serializer as initialized.
     *
     * @throws Exception if the resource cannot be fetched or cleansed
     */
    public void init() throws Exception {
        String fetched = this.getResourceAsString();
        this.html = fetched;
        this.html = this.cleanseHTML(fetched);
        this.initialized = true;
    }

    /**
     * Runs the content extraction.
     *
     * <p>With an anchor name set, only the anchor section is extracted. Otherwise the page is
     * parsed with the current first-level elements, and re-parsed with up to two fallback
     * element sets while the stripped content stays below the repass threshold. Afterwards the
     * description is topped up from the recorded sections and truncated, and the collected
     * content is relativized and passed through {@code WellFormedContentParser}.
     *
     * @throws Exception if loading or parsing fails
     */
    public void parse() throws Exception {
        if (!this.initialized) {
            this.init();
        }
        final int repassThreshold = this.getMinRepassContentLength();
        if (this.anchorName == null) {
            this.parseHTML(this.html);
            if (this.getContentStrippedLength() < repassThreshold) {
                this.parseSecondary();
            }
            if (this.getContentStrippedLength() < repassThreshold) {
                this.parseAcceptableInnerElements();
            }
        } else {
            this.parseAnchorMode();
        }

        // Top up a short description with section text it does not already contain.
        PCDATASection[] sections = this.getPCDATASections();
        for (int i = 0; i < sections.length && this.description.length() < MIN_DESCRIPTION_LENGTH; ++i) {
            String text = sections[i].stripped;
            if (text != null && this.description.indexOf(text) == -1) {
                this.description = this.description + " " + text;
            }
        }
        this.description = this.truncate(this.description, MAX_DESCRIPTION_LENGTH);

        String relativized = this.relativize(this.content.toString());
        this.content = new StringBuffer(WellFormedContentParser.parse(relativized));

        this.debug("Found the following number of PCDATA sections: " + this.vpcdata.size());
        this.debug("Content size: " + this.getContent().length());
        this.debug("Content Stripped Length: " + this.getContentStrippedLength());
        this.debug("Description size: " + this.description.length());
        this.debug("Min Repass Content Length: " + repassThreshold);
    }

    /**
     * Renders the parsed page as an RDF document with one channel and one item.
     *
     * <p>The channel is identified by and linked to the site prefix; the item is identified by
     * the resource and linked to it with the anchor re-attached, if any. A missing title or
     * description is replaced by text derived from the resource URL. The collected content goes
     * into a {@code content:encoded} element after {@code XMLStringCleanser.cleanse}.
     *
     * @return the document serialized with an ISO-8859-1 declaration
     * @throws Exception if building or serializing the document fails
     */
    public String getRSS() throws Exception {
        Namespace contentNs = Namespace.getNamespace("content", "http://purl.org/rss/1.0/modules/content/");
        Namespace rdfNs = Namespace.getNamespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#");
        Namespace rssNs = Namespace.getNamespace("rss", "http://purl.org/rss/1.0/");
        // Looked up as before although nothing below uses them.
        Namespace agNs = Namespace.getNamespace("ag", "http://purl.org/rss/modules/aggregation/");
        Namespace linkNs = Namespace.getNamespace("link", "http://purl.org/rss/modules/link/");

        boolean hasTitle = this.title != null;
        boolean hasDescription = this.description != null;

        Element rdf = new Element("RDF", rdfNs);

        // Channel.
        Element channel = new Element("channel", rssNs);
        channel.setAttribute("about", this.getSite(), rdfNs);
        rdf.addContent(channel);

        String link = this.anchorName != null ? this.resource + "#" + this.anchorName : this.resource;

        channel.addContent(new Element("title", rssNs).setText(
                hasTitle ? this.title : "content for: " + this.resource));
        channel.addContent(new Element("link", rssNs).setText(this.getSite()));
        channel.addContent(new Element("description", rssNs).setText(
                hasDescription ? this.description : "Serialized content for the following URL: " + this.resource));

        Element li = new Element("li", rdfNs);
        li.setAttribute("resource", this.resource, rdfNs);
        Element seq = new Element("Seq", rdfNs);
        seq.addContent(li);
        Element items = new Element("items", rssNs);
        items.addContent(seq);
        channel.addContent(items);

        // Item.
        Element item = new Element("item", rssNs);
        item.setAttribute("about", this.resource, rdfNs);
        rdf.addContent(item);
        item.addContent(new Element("title", rssNs).setText(hasTitle ? this.title : this.resource));
        item.addContent(new Element("link", rssNs).setText(link));
        item.addContent(new Element("description", rssNs).setText(
                hasDescription ? this.description : this.resource));

        Element encoded = new Element("encoded", contentNs);
        encoded.setText(XMLStringCleanser.cleanse(this.content.toString()));
        item.addContent(encoded);

        // Serialize.
        String encoding = "ISO-8859-1";
        XMLOutputter outputter = new XMLOutputter("    ", true, encoding);
        outputter.setEncoding(encoding);
        outputter.setOmitDeclaration(false);
        outputter.setOmitEncoding(false);
        return outputter.outputString(new Document(rdf));
    }

    /**
     * Scans the markup for first-level elements and collects those with acceptable content.
     *
     * <p>The first {@code <title>} element seen supplies the title unless one is already set.
     * For each first-level element, its section is obtained from {@code getPCDATA}. A section
     * is accepted when it is long enough and either is not junk or some earlier section was
     * already accepted. Accepted sections are appended to the content, wrapped in their
     * element (and in {@code <p>} for non-holder elements), and their stripped text is added
     * to the description while it is below {@link #MIN_DESCRIPTION_LENGTH}.
     *
     * @param html the markup to scan
     * @throws Exception if a regular expression operation fails
     */
    private void parseHTML(String html) throws Exception {
        RE firstr = new RE("<([^/> ]+)", 1);
        int parseIndex = 0;
        while (firstr.match(html, parseIndex)) {
            String local_name = firstr.getParen(1).toLowerCase();
            parseIndex = firstr.getParenEnd(0);

            if (local_name.equals("title") && this.title == null) {
                // Locate the end of the opening tag instead of assuming it directly follows the
                // name, and skip the title when the tag is unterminated or the text is empty;
                // both cases used to throw.
                int open = html.indexOf(">", parseIndex);
                int close = html.indexOf("</", parseIndex);
                if (open != -1 && close > open) {
                    String rawTitle = this.strip(html.substring(open + 1, close));
                    if (rawTitle != null) {
                        this.title = this.cleanseTitle(rawTitle);
                    }
                }
            }

            if (!this.isFirstLevelElement(local_name)) {
                continue;
            }
            PCDATASection result = this.getPCDATA(html, local_name, parseIndex);
            if (result == null) {
                // getPCDATA returns null when the element has no closing '>' or no end can be
                // found for it. Skip it; parseIndex is already past the tag name.
                continue;
            }
            parseIndex = result.end;

            String pcdata = result.pcdata;
            String stripped = result.stripped;

            if (this.isAcceptablePCDATA(result) && (!this.isJunkContent(pcdata) || this.foundNonJunkContent)) {
                String holder_start = "<" + local_name + ">";
                String holder_end = "</" + local_name + ">";
                if (!this.isHolderElement(local_name)) {
                    holder_start = "<p><" + local_name + ">";
                    holder_end = "</" + local_name + "></p>";
                }
                this.debug("ACCEPTED: " + local_name);
                this.foundNonJunkContent = true;
                this.content.append("<!-- BEGIN PCDATA SECTION ");
                this.content.append("begin: " + result.begin + " ");
                this.content.append("end: " + result.end + " ");
                this.content.append("pcdata-length: " + result.pcdata.length() + " ");
                this.content.append("stripped-length: " + result.stripped.length());
                this.content.append(" -->\n");
                this.content.append("\n" + holder_start + "\n");
                this.content.append(pcdata);
                this.content.append("\n" + holder_end + "\n");
                if (this.description.length() < MIN_DESCRIPTION_LENGTH) {
                    this.description = this.description + " " + stripped;
                    this.debug("New description is: " + this.description.length());
                    this.debug(this.description);
                }
                continue;
            }

            if (stripped == null || stripped.length() <= 0) {
                continue;
            }
            this.debug("REJECTED: The pcdata was rejected as it was not found acceptable. - " + stripped.length() + ": ");
            this.debug("----");
            this.debug("PCDATA: ");
            this.debug(pcdata);
            this.debug("STRIPPED: ");
            this.debug(stripped);
            this.debug("----");
        }
    }

    /**
     * Extracts the content of the element whose opening tag contains {@code parseIndex}.
     *
     * <p>Starting after the next {@code '>'}, later tags are walked while a nesting counter is
     * kept (self-terminated tags are ignored). The section ends at the first tag for which
     * {@code foundAllPCDATA} returns true. If that tag is an acceptable inner element it is
     * included in the section; otherwise the section stops just before it.
     *
     * @param html the markup being parsed
     * @param local_name lower-case name of the element being extracted
     * @param parseIndex an offset inside the element's opening tag
     * @return the extracted section, or {@code null} if the opening tag has no {@code '>'} or
     *         no terminating tag is found
     * @throws Exception if a regular expression operation fails
     */
    private PCDATASection getPCDATA(String html, String local_name, int parseIndex) throws Exception {
        RE eer = new RE(">");
        if (!eer.match(html, parseIndex)) {
            return null;
        }
        parseIndex = eer.getParenEnd(0);
        int begin = parseIndex;

        // Compiled once here rather than on every loop iteration.
        RE regexp = new RE("</?([^/> ]+)([ ]?/>)?");
        RE selfTerminatingRegexp = new RE("/>$");
        RE terminatingRegexp = new RE("^</");

        int nesting = 1;
        while (regexp.match(html, parseIndex)) {
            parseIndex = regexp.getParenEnd(0);
            String tag = regexp.getParen(0);
            String current_local_name = regexp.getParen(1).toLowerCase();
            if (selfTerminatingRegexp.match(tag)) {
                continue;
            }
            if (terminatingRegexp.match(tag)) {
                --nesting;
            } else {
                ++nesting;
            }
            if (!this.foundAllPCDATA(local_name, current_local_name, nesting)) {
                continue;
            }
            this.debug("local_name: " + local_name);
            this.debug("current_local_name: " + current_local_name);
            int end = regexp.getParenStart(0);
            if (this.isAcceptableInnerElement(current_local_name)) {
                // The match stops before the tag's '>'; step over it, but never past the end of
                // the document (a truncated final tag used to make substring throw).
                end = Math.min(regexp.getParenEnd(0) + 1, html.length());
            }
            String pcdata = this.cleansePCDATA(this.stripnbsp(html.substring(begin, end))).trim();
            return this.section(pcdata, begin, end);
        }
        return null;
    }

    /**
     * Decides whether the tag just read ends the section being extracted.
     *
     * @param local_name name of the element being extracted
     * @param current_local_name name of the tag just read
     * @param nesting nesting counter after accounting for that tag
     * @return {@code true} if the tag is not an acceptable inner element, or if nesting has
     *         dropped to zero or below and the tag either has the same name as the extracted
     *         element or is a first-level element
     */
    private boolean foundAllPCDATA(String local_name, String current_local_name, int nesting) {
        if (!this.isAcceptableInnerElement(current_local_name)) {
            return true;
        }
        return nesting <= 0
                && (local_name.equals(current_local_name) || this.isFirstLevelElement(current_local_name));
    }

    /**
     * Wraps extracted markup in a {@code PCDATASection}. A section with text is also recorded
     * in the section list; a section whose stripped text is empty is returned with empty
     * strings and is not recorded.
     *
     * @param pcdata the extracted markup
     * @param begin start offset of the section in the source markup
     * @param end end offset of the section in the source markup
     * @return the section; never {@code null}
     * @throws Exception if stripping fails
     */
    private PCDATASection section(String pcdata, int begin, int end) throws Exception {
        String stripped = this.strip(pcdata);
        boolean empty = stripped == null;

        this.debug(empty ? "FAILED" : "SUCCESS on the following pcdata: ");
        this.debug("begin: " + begin);
        this.debug("end: " + end);
        this.debug(pcdata);
        if (empty) {
            return new PCDATASection("", "", begin, end);
        }

        this.debug("SUCCESS - stripped: ");
        this.debug(stripped);
        PCDATASection recorded = new PCDATASection(pcdata, stripped, begin, end);
        this.vpcdata.addElement(recorded);
        return recorded;
    }

    /**
     * Reduces markup to plain text: {@code &nbsp;} entities and tags are replaced by spaces
     * and runs of spaces or newlines are collapsed.
     *
     * @param content the markup to strip
     * @return the text, or {@code null} if nothing remains
     * @throws Exception if a regular expression operation fails
     */
    public String strip(String content) throws Exception {
        RE tagRe = new RE("</?[^>]+>", 1);
        String withoutEntities = this.stripnbsp(content).trim();
        String withoutTags = tagRe.subst(withoutEntities, " ").trim();
        String text = this.normalize(withoutTags);
        if (text.equals("")) {
            return null;
        }
        return text;
    }

    /**
     * Collapses each run of spaces to one space and each run of newlines to one newline,
     * trimming after each step.
     *
     * @param content the text to normalize
     * @return the normalized text
     * @throws Exception if a regular expression operation fails
     */
    private String normalize(String content) throws Exception {
        RE spaces = new RE("[ ]+", 1);
        String collapsed = spaces.subst(content, " ").trim();
        RE newlines = new RE("[\n]+", 1);
        return newlines.subst(collapsed, "\n").trim();
    }

    /**
     * Replaces every {@code &nbsp;} entity with a plain space.
     *
     * @param content the text to process
     * @return the text with the entities replaced
     * @throws Exception if a regular expression operation fails
     */
    private String stripnbsp(String content) throws Exception {
        return new RE("&nbsp;").subst(content, " ");
    }

    /**
     * Tells whether the name is in the current first-level element list.
     *
     * @param local_name lower-case element name
     * @return {@code true} if the list contains the name
     */
    private boolean isFirstLevelElement(String local_name) {
        return this.firstLevelElements.indexOf(local_name) >= 0;
    }

    /**
     * Tells whether the name is in the acceptable inner element list.
     *
     * @param current_local_name lower-case element name
     * @return {@code true} if the list contains the name
     */
    private boolean isAcceptableInnerElement(String current_local_name) {
        return this.acceptableInnerElements.indexOf(current_local_name) >= 0;
    }

    /**
     * Appends a name to the acceptable inner element list.
     *
     * @param name lower-case element name
     */
    private void addAcceptableInnerElement(String name) {
        acceptableInnerElements.addElement(name);
    }

    /**
     * Rewrites the first quoted {@code src} or {@code href} value of every {@code <img>} and
     * {@code <a>} element through {@link #expand(String)}.
     *
     * <p>All other text is copied unchanged. Elements without such an attribute are left
     * untouched; an attribute is only considered when it lies inside the element being
     * processed.
     *
     * @param content the markup to rewrite
     * @return the markup with link targets expanded
     * @throws Exception if a regular expression operation or link expansion fails
     */
    public String relativize(String content) throws Exception {
        StringBuffer buff = new StringBuffer();
        RE elementr = new RE("<(img|a)[^>]*>", 3);
        RE attributer = new RE("(src|href)=[\"']([^\"']+)[\"'][^>]*/?>", 3);

        // 'copied' marks how much input is already in buff and 'search' where the next element
        // lookup starts. With a single shared index, text in front of an attribute-less element
        // was silently dropped when no later attribute existed.
        int copied = 0;
        int search = 0;
        while (elementr.match(content, search)) {
            int elementStart = elementr.getParenStart(0);
            int elementEnd = elementr.getParenEnd(0);
            // Only accept an attribute that starts inside this element; one found further on
            // belongs to a later element and is handled when that element is reached.
            if (attributer.match(content, elementStart) && attributer.getParenStart(0) < elementEnd) {
                buff.append(content.substring(copied, attributer.getParenStart(2)));
                buff.append(this.expand(attributer.getParen(2)));
                copied = attributer.getParenEnd(2);
            }
            search = elementEnd;
        }
        buff.append(content.substring(copied));
        return buff.toString();
    }

    /**
     * Prints the command-line usage to standard output.
     */
    private static void syntax() {
        // The message used to name a different class under a doubled "org.org" package.
        System.out.println("SYNTAX: org.openprivacy.reptile.RSSContentSerializer URL");
    }

    /**
     * Command-line entry point: serializes the URL given as the only argument and prints the
     * resulting RSS to standard output. Any other argument count prints the usage instead.
     * Failures are reported as a stack trace.
     *
     * @param args command-line arguments; exactly one URL is expected
     */
    public static void main(String[] args) {
        if (args.length != 1) {
            RSSContentSerializer.syntax();
            return;
        }
        RSSContentSerializer rcs = new RSSContentSerializer();
        rcs.setResource(args[0]);
        try {
            rcs.parse();
            String rss = rcs.getRSS();
            System.out.println(rss);
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns a snapshot of the sections recorded so far, in the order they were recorded.
     *
     * @return a new array holding the recorded sections; empty if there are none
     */
    public PCDATASection[] getPCDATASections() {
        // Declared with its real type: an Object[] local cannot be returned as PCDATASection[].
        PCDATASection[] result = new PCDATASection[this.vpcdata.size()];
        this.vpcdata.copyInto(result);
        return result;
    }

    /**
     * Writes a message to standard error unconditionally.
     *
     * @param message the text to print
     */
    private void debugAlways(String message) {
        final java.io.PrintStream err = System.err;
        err.println(message);
    }

    /**
     * Debug hook. Does nothing while {@link #DEBUG} is {@code false}, which is its value in
     * this build.
     *
     * @param message the diagnostic text
     */
    private void debug(String message) {
        if (DEBUG) {
            this.debugAlways(message);
        }
    }

    /**
     * Returns the content collected so far.
     *
     * @return the content buffer as a string
     */
    public String getContent() {
        return content.toString();
    }

    /**
     * Returns the resource URL up to, but not including, its last path slash: the prefix that
     * relative links are resolved against.
     *
     * <p>When the only slashes are those of the scheme separator (for example
     * {@code http://example.com}), the whole resource is returned.
     *
     * @return the base prefix, without a trailing slash
     */
    public String getBase() {
        // A slash counts as a path slash only if it lies beyond the first position after "://".
        // The former fixed threshold of 8 fitted "https://" but treated the path slash of an
        // "http://" URL with a one-character host as part of the scheme. Without a scheme the
        // old threshold is kept.
        int schemeEnd = this.resource.indexOf("://");
        int threshold = schemeEnd == -1 ? "http://".length() + 1 : schemeEnd + "://".length();
        int end = this.resource.lastIndexOf("/");
        if (end <= threshold) {
            // Also covers end == -1 (no slash at all).
            end = this.resource.length();
        }
        return this.resource.substring(0, end);
    }

    /**
     * Returns the resource URL up to the first slash found at or after offset 8, or the whole
     * resource if there is none. The value is computed on first use and cached.
     *
     * @return the site prefix
     */
    public String getSite() {
        if (this.site != null) {
            return this.site;
        }
        int firstPathSlash = this.resource.indexOf("/", 8);
        int end = firstPathSlash == -1 ? this.resource.length() : firstPathSlash;
        this.site = this.resource.substring(0, end);
        return this.site;
    }

    /**
     * Resolves a link found in the page against the resource URL.
     *
     * <p>Resolution is delegated to {@link java.net.URL#URL(java.net.URL, String)}, so absolute
     * links of any supported scheme, scheme-relative links ({@code //host/...}), root-relative
     * links, {@code ../} segments, plain relative paths and bare fragments are all handled.
     * The earlier hand-written version recognised only {@code http://} as absolute, which
     * mangled {@code https://} links, and its {@code ..} branch could not strip a directory
     * from a resource that did not end in a slash.
     *
     * @param link the link as written in the page
     * @return the absolute form of the link, or the link unchanged when it cannot be resolved
     *         (for example a {@code javascript:} pseudo-URL)
     * @throws Exception declared for compatibility with existing callers
     */
    public String expand(String link) throws Exception {
        try {
            URL context = new URL(this.resource);
            return new URL(context, link).toExternalForm();
        }
        catch (java.net.MalformedURLException unresolvable) {
            // Unknown scheme or unusable context: leave the author's link as written.
            return link;
        }
    }

    /**
     * Heuristic that reports whether a section consists mostly of links.
     *
     * <p>Tags other than those captured as "a" are removed, whitespace between tags is
     * collapsed, and the characters covered by anchor elements (plus up to five characters
     * following each one) are counted. The section is junk when that count exceeds 80 percent
     * of the remaining text.
     *
     * @param content the section markup
     * @return {@code true} if more than 80 percent of the reduced content is link text
     * @throws Exception if a regular expression operation fails
     */
    public boolean isJunkContent(String content) throws Exception {
        content = this.normalize(content);
        int index = 0;
        // Matches a tag with at least one character between its name and '>'.
        // NOTE(review): flag value 3 is presumably case-insensitive | multiline - confirm
        // against the RE flag constants.
        RE regexp = new RE("</?([a-zA-Z]+)[^>]+>", 3);
        while (regexp.match(content, index)) {
            String local_name = regexp.getParen(1).toLowerCase();
            if (!local_name.equals("a")) {
                // Cut the tag out, trimming both halves, and rescan from where it started.
                content = content.substring(0, regexp.getParenStart(0)).trim() + content.substring(regexp.getParenEnd(0), content.length()).trim();
                index = regexp.getParenStart(0);
                continue;
            }
            // Tag captured as "a": keep it and continue after it.
            index = regexp.getParenEnd(0);
        }
        // Remove newlines and the single space between adjacent tags.
        regexp = new RE("\n");
        content = regexp.subst(content, "");
        regexp = new RE("> <");
        content = regexp.subst(content, "><");
        // An anchor element with text content, followed by up to five non-'<' characters.
        regexp = new RE("(<a [^>]+>[^<]+</a>)[^<]?[^<]?[^<]?[^<]?[^<]?", 3);
        index = 0;
        int matchCount = 0;
        this.debug("CONTENT: ");
        this.debug(content);
        while (regexp.match(content, index)) {
            this.debug("match was: " + regexp.getParen(0));
            // Count the whole match, but resume right after the anchor itself (group 1).
            matchCount += regexp.getParenEnd(0) - regexp.getParenStart(0);
            index = regexp.getParenEnd(1);
        }
        // For empty content this is 0.0 / 0.0 = NaN, and the comparison below is then false.
        double percentage = (double)matchCount / (double)content.length() * 100.0;
        this.debug("percentage: " + percentage);
        return percentage > 80.0;
    }

    /**
     * Returns the current mode value.
     *
     * @return the mode
     */
    public int getMode() {
        return mode;
    }

    /**
     * Switches the mode to {@link #MODE_MINIMAL}.
     */
    public void setModeMinimal() {
        this.mode = MODE_MINIMAL;
    }

    /**
     * Removes comments, script elements and style elements from the markup by applying
     * {@link #delete(String, String, String)} for each begin/end pattern pair in turn.
     *
     * @param html the markup to clean
     * @return the cleaned markup
     * @throws Exception if a regular expression operation fails
     */
    public String cleanseHTML(String html) throws Exception {
        String[][] spans = {
            {"<!--", "--[ ]?/?>"},
            {"<script", "</script>"},
            {"<style", "</style>"},
            {"<style", "/>"},
        };
        String cleaned = html;
        for (int i = 0; i < spans.length; ++i) {
            cleaned = this.delete(spans[i][0], spans[i][1], cleaned);
        }
        return cleaned;
    }

    /**
     * Removes every span that starts with a match of {@code begin_regexp} and ends with the
     * next match of {@code end_regexp}.
     *
     * <p>If a begin match has no end match after it, the text is cut off right after that
     * begin match and processing stops.
     *
     * @param begin_regexp pattern marking the start of a span
     * @param end_regexp pattern marking the end of a span
     * @param pcdata the text to process
     * @return the text with the spans removed
     * @throws Exception if a pattern is invalid
     */
    public String delete(String begin_regexp, String end_regexp, String pcdata) throws Exception {
        RE opener = new RE(begin_regexp);
        RE closer = new RE(end_regexp);
        int from = 0;
        while (opener.match(pcdata, from)) {
            from = opener.getParenStart(0);
            if (!closer.match(pcdata, from)) {
                // Unterminated span: keep the text up to and including the begin match.
                return pcdata.substring(0, opener.getParenEnd(0));
            }
            // Splice the span out; the next search resumes where it started.
            pcdata = pcdata.substring(0, from) + pcdata.substring(closer.getParenEnd(0));
        }
        return pcdata;
    }

    /**
     * Removes opening and closing {@code font} tags, keeping their content.
     *
     * @param pcdata the markup to clean
     * @return the markup without font tags
     * @throws Exception if a regular expression operation fails
     */
    public String cleansePCDATA(String pcdata) throws Exception {
        RE fontTag = new RE("</?font[^>]*>", 1);
        return fontTag.subst(pcdata, "");
    }

    /**
     * Sums the stripped-text lengths of all recorded sections.
     *
     * @return the total number of characters; sections without stripped text count as zero
     */
    public int getContentStrippedLength() {
        int total = 0;
        PCDATASection[] sections = this.getPCDATASections();
        for (int i = 0; i < sections.length; ++i) {
            String text = sections[i].stripped;
            total += text == null ? 0 : text.length();
        }
        return total;
    }

    /**
     * Tells whether a section has enough text to be worth keeping.
     *
     * @param section the section to test
     * @return {@code true} if its stripped text is longer than {@link #MIN_CDATA_LENGTH}
     */
    public boolean isAcceptablePCDATA(PCDATASection section) {
        String text = section.stripped;
        if (text == null) {
            return false;
        }
        return text.length() > MIN_CDATA_LENGTH;
    }

    /**
     * Tells whether the element is emitted without an extra {@code <p>} wrapper.
     *
     * @param local_name lower-case element name
     * @return {@code true} for {@code p} and {@code br}
     */
    public boolean isHolderElement(String local_name) {
        if (local_name.equals("p")) {
            return true;
        }
        return local_name.equals("br");
    }

    /**
     * Sets the initialized flag. Setting it to {@code true} makes {@code parse()} skip
     * {@code init()}.
     *
     * @param initialized the new flag value
     */
    public void setInitialized(boolean initialized) {
        this.initialized = initialized;
    }

    /**
     * Returns the initialized flag.
     *
     * @return {@code true} if the flag is set
     */
    public boolean getInitialized() {
        return initialized;
    }

    /**
     * Shortens a string to at most {@code length} characters and appends {@code "..."}.
     * A value of exactly {@code length} characters also receives the ellipsis.
     *
     * @param value the text to shorten
     * @param length the number of leading characters to keep
     * @return the value unchanged if it is shorter than {@code length}; otherwise its first
     *         {@code length} characters followed by {@code "..."}
     */
    public String truncate(String value, int length) {
        if (value.length() < length) {
            return value;
        }
        return value.substring(0, length) + "...";
    }

    /**
     * Derives a title from the start of a description.
     *
     * <p>The title is extended sentence by sentence (a sentence ends at {@code .}, {@code ?},
     * {@code !} or {@code :} followed by a space) until it is at least
     * {@link #MIN_TITLE_WIDTH} characters long, never beyond {@link #MAX_TITLE_WIDTH}. If no
     * sentence end is usable, the first {@link #MAX_TITLE_WIDTH} characters are taken, or the
     * whole description when it is shorter than that.
     *
     * @param description the text to derive the title from
     * @return the cleansed title
     * @throws Exception if a regular expression operation fails
     */
    public String getTitle(String description) throws Exception {
        RE sentenceEnd = new RE("[.?!:] ");
        int end = 0;
        while (sentenceEnd.match(description, end) && end < MIN_TITLE_WIDTH) {
            if (sentenceEnd.getParenEnd(0) > MAX_TITLE_WIDTH) {
                break;
            }
            end = sentenceEnd.getParenEnd(0);
        }
        if (end == 0) {
            // Clamp to the description length; substring(0, 200) used to throw for a shorter
            // description without a sentence break.
            end = Math.min(MAX_TITLE_WIDTH, description.length());
        }
        String title = description.substring(0, end);
        title = this.truncate(title, MAX_TITLE_WIDTH);
        return this.cleanseTitle(title);
    }

    /**
     * Computes the stripped-content length below which {@code parse()} retries with a
     * fallback element set: 40 percent of the length of the page text once tags are removed
     * and whitespace is collapsed.
     *
     * <p>As a side effect the reduced text is written to {@code out.log} in the working
     * directory. NOTE(review): this looks like a debugging leftover - confirm whether it is
     * still wanted.
     *
     * @return the threshold in characters
     * @throws Exception if a regular expression operation or the file write fails
     */
    public int getMinRepassContentLength() throws Exception {
        String ripped = this.html;
        RE regexp = new RE("<[^>]+>", 1);
        ripped = regexp.subst(ripped, "");
        regexp = new RE("[ ]+", 1);
        ripped = regexp.subst(ripped, " ").trim();
        regexp = new RE("[\t]+", 1);
        ripped = regexp.subst(ripped, " ").trim();
        regexp = new RE("^[ ]$", 2);
        ripped = regexp.subst(ripped, "");
        regexp = new RE("[\n]+", 3);
        ripped = regexp.subst(ripped, "\n").trim();
        FileOutputStream log = new FileOutputStream("out.log");
        try {
            log.write(ripped.getBytes());
        } finally {
            // The stream was previously never closed, leaking a file handle on every call.
            log.close();
        }
        return (int)((double)ripped.length() * 0.4);
    }

    /**
     * Entity cleansing hook; currently returns its argument unchanged.
     *
     * @param data the text to process
     * @return {@code data}
     * @throws Exception never thrown by this implementation
     */
    public String cleanseEntities(String data) throws Exception {
        final String unchanged = data;
        return unchanged;
    }

    /**
     * Removes leading and trailing whitespace.
     *
     * @param data the text to trim
     * @return the trimmed text
     * @throws Exception never thrown by this implementation
     */
    private String cleanseLeadingGarbage(String data) throws Exception {
        String trimmed = data.trim();
        return trimmed;
    }

    /**
     * Removes a leading year from the text: at most fifteen arbitrary characters followed by
     * a four-digit number of the form 20[01][0-9] at the start are dropped, then the result
     * is trimmed.
     *
     * @param data the text to clean
     * @return the cleaned, trimmed text
     * @throws Exception if a regular expression operation fails
     */
    private String cleanseDate(String data) throws Exception {
        RE leadingYear = new RE("^.?.?.?.?.?.?.?.?.?.?.?.?.?.?.?20[01][0-9]");
        return leadingYear.subst(data, "").trim();
    }

    /**
     * Cleans a title by applying the date, leading-garbage and entity cleansers in that order.
     *
     * @param title the raw title
     * @return the cleansed title
     * @throws Exception if a cleanser fails
     */
    public String cleanseTitle(String title) throws Exception {
        String withoutDate = this.cleanseDate(title);
        String withoutGarbage = this.cleanseLeadingGarbage(withoutDate);
        return this.cleanseEntities(withoutGarbage);
    }

    /**
     * One extracted section of the page: its markup, its plain text, and the offsets it was
     * taken from.
     */
    class PCDATASection {
        /** Section markup. */
        public String pcdata;
        /** Section text with markup removed. */
        public String stripped;
        /** Start offset of the section in the source markup. */
        public int begin;
        /** End offset of the section in the source markup. */
        public int end;

        /**
         * Creates a section from its four values.
         *
         * @param pcdata the section markup
         * @param stripped the section text with markup removed
         * @param begin start offset in the source markup
         * @param end end offset in the source markup
         */
        public PCDATASection(String pcdata, String stripped, int begin, int end) {
            this.begin = begin;
            this.end = end;
            this.pcdata = pcdata;
            this.stripped = stripped;
        }
    }
}

