/*
 * Decompiled with CFR 0.152.
 */
package com.rapidminer.operator.extraction;

import com.rapidminer.operator.UserError;
import com.rapidminer.operator.extraction.ExtractingInputFilter;
import com.rapidminer.operator.extraction.ExtractionException;
import com.rapidminer.operator.extraction.RegexExtractor;
import com.rapidminer.operator.extraction.TagSoupSAXBuilder;
import com.rapidminer.operator.extraction.TextExtractor;
import com.rapidminer.operator.extraction.XPathExtractor;
import edu.udo.cs.wvtool.config.WVTConfiguration;
import edu.udo.cs.wvtool.generic.inputfilter.TextInputFilter;
import edu.udo.cs.wvtool.main.WVTDocumentInfo;
import edu.udo.cs.wvtool.main.WVTool;
import edu.udo.cs.wvtool.util.WVToolException;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.Iterator;
import java.util.LinkedList;
import org.jaxen.JaxenException;
import org.jdom.CDATA;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.filter.Filter;
import org.jdom.input.SAXBuilder;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
/**
 * Wraps a single document so that {@link TextExtractor}s can be applied to it.
 *
 * <p>Depending on how the wrapper was constructed it holds the document as plain
 * text ({@code content}), as a parsed JDOM tree ({@code dom}), or both. Regex
 * extraction runs against the text; XPath extraction runs against the tree, which
 * is built lazily from the text when it has not been parsed yet.
 */
public class TextExtractionWrapper {
    /** Content type code for plain text; also what {@link #determineType(String)} returns for unknown types. */
    public static final int CONTENT_TYPE_TEXT = 0;
    /** Content type code for XML documents (parsed with a plain {@link SAXBuilder}). */
    public static final int CONTENT_TYPE_XML = 1;
    /** Content type code for HTML documents (parsed with a {@link TagSoupSAXBuilder}). */
    public static final int CONTENT_TYPE_HTML = 2;
    /**
     * Content type code for PDF documents. This class has no PDF-specific handling;
     * its constructors treat the code like any other non-XML/HTML type.
     */
    public static final int CONTENT_TYPE_PDF = 3;

    /** Whether CDATA nodes are stripped from HTML documents after parsing. */
    private boolean ignoreCDATA = true;
    /** Plain-text form of the document, or {@code null} if it was never read as text. */
    private String content;
    /** One of the {@code CONTENT_TYPE_*} codes. */
    private int contentType;
    /** Parsed form of the document, or {@code null} if it has not been parsed (yet). */
    private Document dom = null;

    /**
     * Creates a wrapper around text that is already in memory. Nothing is parsed here;
     * a DOM is built on demand by {@link #getValues(XPathExtractor)}.
     *
     * @param content     the document text
     * @param contentType one of the {@code CONTENT_TYPE_*} codes
     * @param ignoreCDATA stored CDATA flag (only consulted when an HTML stream is parsed)
     */
    public TextExtractionWrapper(String content, int contentType, boolean ignoreCDATA) {
        this.ignoreCDATA = ignoreCDATA;
        this.content = content;
        this.contentType = contentType;
    }

    /**
     * Creates a wrapper from a stream. A stream can only be consumed once, so XML and
     * HTML input is parsed into a DOM only, and every other type is read as text only.
     *
     * @param inStream    the document data
     * @param contentType one of the {@code CONTENT_TYPE_*} codes
     * @param ignoreCDATA whether CDATA nodes are removed from parsed HTML
     * @throws ExtractionException if the stream cannot be read or parsed
     */
    public TextExtractionWrapper(InputStream inStream, int contentType, boolean ignoreCDATA) throws ExtractionException {
        this.ignoreCDATA = ignoreCDATA;
        this.contentType = contentType;
        try {
            if (isMarkup(contentType)) {
                this.readXMLBasedDocument(inStream, null);
            } else {
                // NOTE(review): decodes with the platform default charset — confirm this is intended.
                this.readTextBasedDocument(new InputStreamReader(inStream));
            }
        }
        catch (IOException e) {
            throw new ExtractionException("", e, new UserError(null, 302, "unknown", e));
        }
    }

    /**
     * Creates a wrapper from a file. The text form is always read; for XML and HTML
     * the file is additionally parsed into a DOM first (the file is opened twice).
     *
     * @param inFile      the file to read
     * @param contentType one of the {@code CONTENT_TYPE_*} codes
     * @param ignoreCDATA whether CDATA nodes are removed from parsed HTML
     * @throws ExtractionException if the file cannot be read or parsed
     */
    public TextExtractionWrapper(File inFile, int contentType, boolean ignoreCDATA) throws ExtractionException {
        this.ignoreCDATA = ignoreCDATA;
        this.contentType = contentType;
        try {
            if (isMarkup(contentType)) {
                InputStream xmlStream = new FileInputStream(inFile);
                try {
                    this.readXMLBasedDocument(xmlStream, null);
                } finally {
                    // This stream is opened here, so it must be released here as well.
                    closeQuietly(xmlStream);
                }
            }
            // NOTE(review): decodes with the platform default charset — confirm this is intended.
            this.readTextBasedDocument(new InputStreamReader(new FileInputStream(inFile)));
        }
        catch (IOException e) {
            throw new ExtractionException("", e, new UserError(null, 302, "unknown", e));
        }
    }

    /**
     * Creates a wrapper for a document described by WVTool metadata.
     *
     * <p>If the document is XML/HTML, or the configured {@code "inputfilter"} component
     * is an {@link ExtractingInputFilter}, the raw stream is offered to the XML reader
     * (which only parses XML/HTML) and the text form is produced by a
     * {@link TextInputFilter}. Otherwise the text is read through the reader that
     * WVTool provides for the configuration.
     *
     * @param info        WVTool description of the document
     * @param config      WVTool configuration used to load the document
     * @param ignoreCDATA whether CDATA nodes are removed from parsed HTML
     * @throws ExtractionException if loading, reading or parsing fails
     */
    public TextExtractionWrapper(WVTDocumentInfo info, WVTConfiguration config, boolean ignoreCDATA) throws ExtractionException {
        WVTool wvtool = new WVTool(false);
        this.ignoreCDATA = ignoreCDATA;
        // The literals are the codes returned by WVTConfiguration.determineType;
        // presumably they are WVTool's own HTML/XML type constants — TODO confirm.
        switch (WVTConfiguration.determineType(info)) {
            case 2: {
                this.contentType = CONTENT_TYPE_HTML;
                break;
            }
            case 1: {
                this.contentType = CONTENT_TYPE_XML;
                break;
            }
            default: {
                this.contentType = CONTENT_TYPE_TEXT;
            }
        }
        try {
            // The filter lookup stays behind the short-circuit so it is only done for non-markup types.
            if (isMarkup(this.contentType)
                    || config.getComponentForStep("inputfilter", info) instanceof ExtractingInputFilter) {
                // NOTE(review): this stream is never closed here; whether WVTool expects
                // callers to close streams it hands out is not visible in this file.
                this.readXMLBasedDocument(wvtool.getInputStream(info, config), info);
                TextInputFilter txtFilter = new TextInputFilter();
                Reader inReader = txtFilter.convertToPlainText(wvtool.getInputStream(info, config), info);
                this.readTextBasedDocument(inReader);
            } else {
                // Reaching this point already implies the input filter is not an
                // ExtractingInputFilter, so no further distinction is needed.
                this.readTextBasedDocument(wvtool.getReader(info, config));
            }
        }
        catch (IOException e) {
            throw new ExtractionException("", e, new UserError(null, 302, info.getSourceName(), e));
        }
        catch (WVToolException e) {
            throw new ExtractionException("", e, new UserError(null, 306, new Object[]{"WVTool", e}));
        }
    }

    /**
     * Dispatches to the regex or XPath variant depending on the extractor's runtime type.
     * Any extractor that is not a {@link RegexExtractor} is cast to {@link XPathExtractor}.
     *
     * @param extr the extractor to apply
     * @return the extracted values
     * @throws ExtractionException if XPath extraction fails
     */
    public Iterator<String> getValues(TextExtractor extr) throws ExtractionException {
        if (extr instanceof RegexExtractor) {
            return this.getValues((RegexExtractor)extr);
        }
        return this.getValues((XPathExtractor)extr);
    }

    /**
     * Applies a regex extractor to the text form of the document.
     *
     * @param extr the regex extractor
     * @return the matches reported by the extractor
     */
    public Iterator<String> getValues(RegexExtractor extr) {
        if (this.content == null) {
            // NOTE(review): falls back to Document.toString(); that may be only a short
            // description of the document rather than its serialized markup — confirm.
            this.content = this.dom.toString();
        }
        return extr.findPatterns(this.content);
    }

    /**
     * Applies an XPath extractor to the parsed document, building the DOM from the
     * text form first if the document has not been parsed yet.
     *
     * @param xpathExtractor the XPath extractor
     * @return the extractor's results, or an empty iterator if it returned {@code null}
     * @throws ExtractionException if parsing or XPath evaluation fails
     */
    public Iterator<String> getValues(XPathExtractor xpathExtractor) throws ExtractionException {
        Iterator<String> result = null;
        try {
            if (this.dom == null) {
                // Typed as SAXBuilder so that both builder kinds fit the same variable
                // (assumes TagSoupSAXBuilder is a SAXBuilder subclass).
                SAXBuilder builder;
                if (this.contentType == CONTENT_TYPE_HTML) {
                    builder = new TagSoupSAXBuilder();
                } else {
                    builder = new SAXBuilder();
                }
                this.dom = builder.build(new StringReader(this.content));
            }
            result = xpathExtractor.findPatterns(this.dom);
        }
        catch (JaxenException e) {
            throw new ExtractionException("", e, new UserError(null, 401, new Object[]{e}));
        }
        catch (JDOMException e) {
            throw new ExtractionException("", e, new UserError(null, 401, new Object[]{e}));
        }
        catch (IOException e) {
            throw new ExtractionException("", e, new UserError(null, 302, "unknown", e));
        }
        if (result != null) {
            return result;
        }
        return new LinkedList<String>().iterator();
    }

    /**
     * Parses the stream into {@code dom} when the content type is XML or HTML and no
     * DOM exists yet; otherwise does nothing and leaves the stream untouched.
     *
     * <p>HTML goes through a {@link TagSoupSAXBuilder}; when {@code docInfo} is given,
     * the stream is first passed through a {@link TextInputFilter}. CDATA stripping is
     * applied to HTML only.
     *
     * @param inStream the raw document data; not closed by this method
     * @param docInfo  WVTool document description, or {@code null}
     * @throws ExtractionException if the document cannot be parsed
     * @throws IOException         if reading fails
     */
    private void readXMLBasedDocument(InputStream inStream, WVTDocumentInfo docInfo) throws ExtractionException, IOException {
        if (!isMarkup(this.contentType) || this.dom != null) {
            return;
        }
        try {
            if (this.contentType == CONTENT_TYPE_HTML) {
                TagSoupSAXBuilder builder = new TagSoupSAXBuilder();
                if (docInfo != null) {
                    TextInputFilter txtFilter = new TextInputFilter();
                    this.dom = builder.build(txtFilter.convertToPlainText(inStream, docInfo));
                } else {
                    this.dom = builder.build(inStream);
                }
                if (this.ignoreCDATA) {
                    removeCDATA(this.dom);
                }
            } else {
                this.dom = new SAXBuilder().build(inStream);
            }
        }
        catch (JDOMException e) {
            throw new ExtractionException("", e, new UserError(null, 401, new Object[]{e}));
        }
    }

    /**
     * Removes every CDATA child from every element of the document.
     *
     * @param document the document to clean in place
     */
    private static void removeCDATA(Document document) {
        // Snapshot the elements first so that removing content cannot disturb the
        // live descendant iterator while it is still being walked.
        LinkedList<Element> elements = new LinkedList<Element>();
        Iterator it = document.getDescendants(new Filter(){

            public boolean matches(Object obj) {
                return obj instanceof Element;
            }
        });
        while (it.hasNext()) {
            elements.add((Element)it.next());
        }
        Filter cdataFilter = new Filter(){

            public boolean matches(Object obj) {
                return obj instanceof CDATA;
            }
        };
        for (Element element : elements) {
            element.removeContent(cdataFilter);
        }
    }

    /**
     * Reads the reader line by line into {@code content}, terminating every line with
     * {@code "\n"}. The reader is closed afterwards, whether or not reading succeeded.
     *
     * @param inReader source of the document text
     * @throws ExtractionException declared for callers; not thrown by this method itself
     * @throws IOException         if reading fails
     */
    private void readTextBasedDocument(Reader inReader) throws ExtractionException, IOException {
        StringBuilder contentBuf = new StringBuilder();
        BufferedReader in = new BufferedReader(inReader);
        try {
            String line;
            while ((line = in.readLine()) != null) {
                contentBuf.append(line);
                contentBuf.append("\n");
            }
        } finally {
            closeQuietly(in);
        }
        this.content = contentBuf.toString();
    }

    /**
     * Tells whether the given content type is parsed into a DOM.
     *
     * @param type one of the {@code CONTENT_TYPE_*} codes
     * @return {@code true} for XML and HTML
     */
    private static boolean isMarkup(int type) {
        return type == CONTENT_TYPE_XML || type == CONTENT_TYPE_HTML;
    }

    /**
     * Closes the resource, ignoring {@code null} and any failure to close.
     *
     * @param resource the resource to close, may be {@code null}
     */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        }
        catch (IOException ignored) {
            // Only used on inputs: a failed close must not mask the outcome of the read.
        }
    }

    /**
     * Determines the content type of a file from its name's extension (the part after
     * the last {@code '.'}; empty if the name contains no dot).
     *
     * @param f the file whose name is inspected
     * @return one of the {@code CONTENT_TYPE_*} codes
     */
    public static int determineType(File f) {
        String sourceName = f.getName();
        String typeStr = "";
        int index = sourceName.lastIndexOf('.');
        if (index >= 0) {
            typeStr = sourceName.substring(index + 1);
        }
        return TextExtractionWrapper.determineType(typeStr);
    }

    /**
     * Maps a type name or file extension to a content type code, ignoring case.
     *
     * @param typeStr {@code "pdf"}, {@code "html"}, {@code "htm"}, {@code "xml"} or anything else
     * @return the matching {@code CONTENT_TYPE_*} code; {@link #CONTENT_TYPE_TEXT} for
     *         unrecognised or {@code null} input
     */
    public static int determineType(String typeStr) {
        if (typeStr == null) {
            return CONTENT_TYPE_TEXT;
        }
        if (typeStr.equalsIgnoreCase("pdf")) {
            return CONTENT_TYPE_PDF;
        }
        if (typeStr.equalsIgnoreCase("html") || typeStr.equalsIgnoreCase("htm")) {
            return CONTENT_TYPE_HTML;
        }
        if (typeStr.equalsIgnoreCase("xml")) {
            return CONTENT_TYPE_XML;
        }
        return CONTENT_TYPE_TEXT;
    }
}

