package com.rapidminer.operator.extraction;

import com.rapidminer.example.Statistics;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.UserError;
import edu.udo.cs.wvtool.config.WVTConfiguration;
import edu.udo.cs.wvtool.generic.inputfilter.TextInputFilter;
import edu.udo.cs.wvtool.main.WVTDocumentInfo;
import edu.udo.cs.wvtool.main.WVTool;
import edu.udo.cs.wvtool.util.WVToolException;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import org.jaxen.JaxenException;
import org.jdom.CDATA;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.filter.Filter;
import org.jdom.input.SAXBuilder;
import org.jdom.output.XMLOutputter;

/* loaded from: input_file:WEB-INF/lib/rapidminer-plugintext-1.0.0.jar:com/rapidminer/operator/extraction/TextExtractionWrapper.class */
/**
 * Holds one document in up to two forms so that extractors can be run against it:
 * the raw text ({@code content}, used by {@link RegexExtractor}) and a JDOM tree
 * ({@code dom}, used by {@link XPathExtractor}). Whichever form is missing is derived
 * lazily from the other one the first time it is needed.
 */
public class TextExtractionWrapper {
    /** Content type code for plain text. */
    public static final int CONTENT_TYPE_TEXT = 0;
    /** Content type code for XML documents. */
    public static final int CONTENT_TYPE_XML = 1;
    /** Content type code for HTML documents (parsed with {@code TagSoupSAXBuilder}). */
    public static final int CONTENT_TYPE_HTML = 2;
    /** Content type code for PDF documents (only reported by {@link #determineType(String)}). */
    public static final int CONTENT_TYPE_PDF = 3;

    /** Matches every {@link Element} node. */
    private static final Filter ELEMENT_FILTER = new Filter() {
        public boolean matches(Object obj) {
            return obj instanceof Element;
        }
    };

    /** Matches every {@link CDATA} node. */
    private static final Filter CDATA_FILTER = new Filter() {
        public boolean matches(Object obj) {
            return obj instanceof CDATA;
        }
    };

    /** If set, CDATA nodes are removed from HTML documents parsed by {@link #readXMLBasedDocument}. */
    private final boolean ignoreCDATA;
    /** One of the {@code CONTENT_TYPE_*} codes. */
    private final int contentType;
    /** Raw text of the document; may be {@code null} until derived from {@link #dom}. */
    private String content;
    /** Parsed tree of the document; may be {@code null} until derived from {@link #content}. */
    private Document dom = null;

    /**
     * Wraps text that is already in memory. No parsing happens until an XPath extractor is applied.
     *
     * @param content     the document text
     * @param contentType one of the {@code CONTENT_TYPE_*} codes
     * @param ignoreCDATA stored CDATA flag (only evaluated when a document is parsed from a stream)
     */
    public TextExtractionWrapper(String content, int contentType, boolean ignoreCDATA) {
        this.ignoreCDATA = ignoreCDATA;
        this.content = content;
        this.contentType = contentType;
    }

    /**
     * Reads the document from a stream. XML/HTML input is parsed into a DOM only; any other
     * type is read line by line into the text buffer. The stream can be consumed just once,
     * which is why only one of the two forms is filled here.
     *
     * @param inputStream source of the document; not closed by this constructor in the XML/HTML case
     * @param contentType one of the {@code CONTENT_TYPE_*} codes
     * @param ignoreCDATA whether CDATA nodes are dropped from parsed HTML
     * @throws ExtractionException if reading fails (user error 302) or parsing fails (user error 401)
     */
    public TextExtractionWrapper(InputStream inputStream, int contentType, boolean ignoreCDATA) throws ExtractionException {
        this.ignoreCDATA = ignoreCDATA;
        this.contentType = contentType;
        try {
            if (isXmlBased(contentType)) {
                readXMLBasedDocument(inputStream, null);
            } else {
                // NOTE(review): decodes with the platform default charset, as the original did.
                readTextBasedDocument(new InputStreamReader(inputStream));
            }
        } catch (IOException e) {
            throw ioFailure(e, Statistics.UNKNOWN);
        }
    }

    /**
     * Reads the document from a file. For XML/HTML the file is parsed into a DOM; in every
     * case the raw text is loaded as well, so both regex and XPath extraction are available.
     *
     * @param file        the file to read
     * @param contentType one of the {@code CONTENT_TYPE_*} codes
     * @param ignoreCDATA whether CDATA nodes are dropped from parsed HTML
     * @throws ExtractionException if reading fails (user error 302) or parsing fails (user error 401)
     */
    public TextExtractionWrapper(File file, int contentType, boolean ignoreCDATA) throws ExtractionException {
        this.ignoreCDATA = ignoreCDATA;
        this.contentType = contentType;
        try {
            if (isXmlBased(contentType)) {
                InputStream xmlStream = new FileInputStream(file);
                try {
                    readXMLBasedDocument(xmlStream, null);
                } finally {
                    // The tree is fully built (or parsing failed); release the file handle either way.
                    closeQuietly(xmlStream);
                }
            }
            // A second, independent stream: the first one has been consumed by the parser.
            // NOTE(review): decodes with the platform default charset, as the original did.
            readTextBasedDocument(new InputStreamReader(new FileInputStream(file)));
        } catch (IOException e) {
            throw ioFailure(e, Statistics.UNKNOWN);
        }
    }

    /**
     * Reads a document described by a WVTool document info object.
     * The type reported by {@code WVTConfiguration.determineType} is mapped as 1 to XML and
     * 2 to HTML; every other value is treated as plain text.
     *
     * @param wVTDocumentInfo  describes the document to load
     * @param wVTConfiguration WVTool configuration used to open the document
     * @param ignoreCDATA      whether CDATA nodes are dropped from parsed HTML
     * @throws ExtractionException on WVTool errors (user error 306), I/O errors (302) or parse errors (401)
     */
    public TextExtractionWrapper(WVTDocumentInfo wVTDocumentInfo, WVTConfiguration wVTConfiguration, boolean ignoreCDATA) throws ExtractionException {
        WVTool wvTool = new WVTool(false);
        this.ignoreCDATA = ignoreCDATA;
        int detectedType;
        switch (WVTConfiguration.determineType(wVTDocumentInfo)) {
            case 1:
                detectedType = CONTENT_TYPE_XML;
                break;
            case 2:
                detectedType = CONTENT_TYPE_HTML;
                break;
            default:
                detectedType = CONTENT_TYPE_TEXT;
                break;
        }
        this.contentType = detectedType;
        try {
            boolean xmlBased = isXmlBased(detectedType);
            if (xmlBased) {
                // Only XML/HTML is parsed, so the extra stream is opened (and closed) only then.
                InputStream xmlStream = wvTool.getInputStream(wVTDocumentInfo, wVTConfiguration);
                try {
                    readXMLBasedDocument(xmlStream, wVTDocumentInfo);
                } finally {
                    closeQuietly(xmlStream);
                }
            }
            // The filter lookup is kept behind the short-circuit so it runs in exactly the
            // same situations as before (only for non-XML/HTML documents).
            if (xmlBased || (wVTConfiguration.getComponentForStep("inputfilter", wVTDocumentInfo) instanceof ExtractingInputFilter)) {
                readTextBasedDocument(new TextInputFilter().convertToPlainText(wvTool.getInputStream(wVTDocumentInfo, wVTConfiguration), wVTDocumentInfo));
            } else {
                readTextBasedDocument(wvTool.getReader(wVTDocumentInfo, wVTConfiguration));
            }
        } catch (WVToolException e) {
            throw new ExtractionException("", e, new UserError((Operator) null, 306, "WVTool", e));
        } catch (IOException e2) {
            throw ioFailure(e2, wVTDocumentInfo.getSourceName());
        }
    }

    /**
     * Dispatches to the regex or XPath variant depending on the extractor's runtime type.
     * Any extractor that is not a {@link RegexExtractor} is cast to {@link XPathExtractor}.
     *
     * @param textExtractor the extractor to apply
     * @return the extracted values
     * @throws ExtractionException if XPath evaluation or DOM construction fails
     */
    public Iterator<String> getValues(TextExtractor textExtractor) throws ExtractionException {
        if (textExtractor instanceof RegexExtractor) {
            return getValues((RegexExtractor) textExtractor);
        }
        return getValues((XPathExtractor) textExtractor);
    }

    /**
     * Applies a regex extractor to the document text. If only the DOM is available, the text
     * is produced once by serializing the DOM and then cached.
     *
     * @param regexExtractor the extractor to apply
     * @return whatever {@code regexExtractor.findPatterns} returns for the document text
     */
    public Iterator<String> getValues(RegexExtractor regexExtractor) {
        if (this.content == null) {
            // Document.toString() is only a short debug description of the tree, so the
            // markup itself has to be produced with an outputter.
            this.content = this.dom != null ? new XMLOutputter().outputString(this.dom) : "";
        }
        return regexExtractor.findPatterns(this.content);
    }

    /**
     * Applies an XPath extractor to the document tree. If only the text is available, it is
     * parsed once (with the TagSoup builder for HTML, a plain {@link SAXBuilder} otherwise)
     * and the tree is cached.
     *
     * @param xPathExtractor the extractor to apply
     * @return the extracted values; an empty iterator if the extractor returns {@code null}
     * @throws ExtractionException on XPath or parse errors (user error 401) or I/O errors (302)
     */
    public Iterator<String> getValues(XPathExtractor xPathExtractor) throws ExtractionException {
        try {
            if (this.dom == null) {
                this.dom = (this.contentType == CONTENT_TYPE_HTML ? new TagSoupSAXBuilder() : new SAXBuilder()).build(new StringReader(this.content));
            }
            Iterator<String> matches = xPathExtractor.findPatterns(this.dom);
            return matches != null ? matches : new LinkedList<String>().iterator();
        } catch (JaxenException e) {
            throw new ExtractionException("", e, new UserError((Operator) null, 401, e));
        } catch (JDOMException e2) {
            throw new ExtractionException("", e2, new UserError((Operator) null, 401, e2));
        } catch (IOException e3) {
            throw ioFailure(e3, Statistics.UNKNOWN);
        }
    }

    /**
     * Parses the stream into {@link #dom}. Does nothing if the content type is neither XML
     * nor HTML, or if a tree already exists. CDATA removal is applied to HTML only.
     *
     * @param inputStream     stream to parse; ownership stays with the caller
     * @param wVTDocumentInfo if non-null, HTML input is first passed through
     *                        {@code TextInputFilter.convertToPlainText} with this info
     */
    private void readXMLBasedDocument(InputStream inputStream, WVTDocumentInfo wVTDocumentInfo) throws ExtractionException, IOException {
        if (!isXmlBased(this.contentType) || this.dom != null) {
            return;
        }
        try {
            if (this.contentType == CONTENT_TYPE_XML) {
                this.dom = new SAXBuilder().build(inputStream);
                return;
            }
            TagSoupSAXBuilder htmlBuilder = new TagSoupSAXBuilder();
            if (wVTDocumentInfo != null) {
                this.dom = htmlBuilder.build(new TextInputFilter().convertToPlainText(inputStream, wVTDocumentInfo));
            } else {
                this.dom = htmlBuilder.build(inputStream);
            }
            if (this.ignoreCDATA) {
                removeCDATA(this.dom);
            }
        } catch (JDOMException e) {
            throw new ExtractionException("", e, new UserError((Operator) null, 401, e));
        }
    }

    /**
     * Removes all CDATA children from every element of the document.
     * The elements are collected first so that removing content never happens while the
     * descendant iterator is still walking the tree.
     */
    private static void removeCDATA(Document document) {
        List<Element> elements = new ArrayList<Element>();
        Iterator<?> descendants = document.getDescendants(ELEMENT_FILTER);
        while (descendants.hasNext()) {
            elements.add((Element) descendants.next());
        }
        for (Element element : elements) {
            element.removeContent(CDATA_FILTER);
        }
    }

    /**
     * Reads the reader line by line into {@link #content}, terminating every line with
     * {@code "\n"}. The reader is closed afterwards, also when reading fails.
     */
    private void readTextBasedDocument(Reader reader) throws ExtractionException, IOException {
        StringBuilder text = new StringBuilder();
        BufferedReader lines = new BufferedReader(reader);
        try {
            String line;
            while ((line = lines.readLine()) != null) {
                text.append(line).append("\n");
            }
        } finally {
            lines.close();
        }
        this.content = text.toString();
    }

    /**
     * Determines the content type from the extension of the file name (the part after the
     * last dot; an empty extension if there is no dot).
     *
     * @param file the file whose name is inspected
     * @return one of the {@code CONTENT_TYPE_*} codes
     */
    public static int determineType(File file) {
        String fileName = file.getName();
        int dot = fileName.lastIndexOf('.');
        return determineType(dot >= 0 ? fileName.substring(dot + 1) : "");
    }

    /**
     * Maps a file extension (case-insensitive, without the dot) to a content type:
     * {@code pdf} to PDF, {@code html}/{@code htm} to HTML, {@code xml} to XML, anything else to text.
     *
     * @param str the extension
     * @return one of the {@code CONTENT_TYPE_*} codes
     */
    public static int determineType(String str) {
        if (str.equalsIgnoreCase("pdf")) {
            return CONTENT_TYPE_PDF;
        }
        if (str.equalsIgnoreCase("html") || str.equalsIgnoreCase("htm")) {
            return CONTENT_TYPE_HTML;
        }
        if (str.equalsIgnoreCase("xml")) {
            return CONTENT_TYPE_XML;
        }
        return CONTENT_TYPE_TEXT;
    }

    /** Tells whether the given type code is one that is parsed into a DOM (XML or HTML). */
    private static boolean isXmlBased(int type) {
        return type == CONTENT_TYPE_XML || type == CONTENT_TYPE_HTML;
    }

    /** Wraps an I/O failure into the exception used throughout this class (user error 302). */
    private static ExtractionException ioFailure(IOException cause, String source) {
        return new ExtractionException("", cause, new UserError((Operator) null, 302, source, cause));
    }

    /** Closes a stream this class opened itself, without letting a close failure hide an earlier error. */
    private static void closeQuietly(InputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException ignored) {
            // The stream was only read from and its data is already in memory (or the
            // read failed and that failure is being reported); nothing to recover here.
        }
    }
}
