/*
 * Decompiled with CFR 0.152.
 */
package com.rapidminer.operator;

import com.rapidminer.gui.wizards.PreviewListener;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.SegmenterPreviewerCreator;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.extraction.ExtractionException;
import com.rapidminer.operator.extraction.TextExtractionWrapper;
import com.rapidminer.operator.extraction.segmenter.DocumentSegmenter;
import com.rapidminer.operator.extraction.segmenter.DocumentSegmenterClass;
import com.rapidminer.operator.extraction.util.FeatureExtractionUtil;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeDirectory;
import com.rapidminer.parameter.ParameterTypePreview;
import com.rapidminer.parameter.ParameterTypeString;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.util.Iterator;
import java.util.List;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
/**
 * Operator that splits every regular file found directly inside the "texts"
 * directory into segments and writes each segment to its own file in the
 * "output" directory.
 *
 * <p>Output files are named {@code seg<N>.<suffix>}, where {@code N} is a
 * counter that runs across all input files (starting at 0) and
 * {@code suffix} is the extension of the input file the segment came from
 * ({@code txt} when the input file name contains no dot).
 *
 * <p>The operator declares no input or output classes and returns an empty
 * {@link IOObject} array; its only effect is the files it writes.
 */
public class DocumentSegmenterOperator
extends Operator {
    public static final String PARAMETER_CONTENT_TYPE = "content_type";
    public static final String PARAMETER_OUTPUT = "output";
    public static final String PARAMETER_EXPRESSION = "expression";
    public static final String PARAMETER_IGNORE_CDATA = "ignore_cdata";

    /** Name of the parameter holding the directory with the documents to segment. */
    private static final String PARAMETER_TEXTS = "texts";

    /** Suffix used for segment files when the input file name has no extension. */
    private static final String DEFAULT_SUFFIX = "txt";

    public DocumentSegmenterOperator(OperatorDescription description) {
        super(description);
    }

    /**
     * Segments all regular files of the input directory and writes every
     * segment to a separate file in the output directory.
     *
     * @return always an empty array
     * @throws OperatorException a {@link UserError} if the input directory
     *         cannot be listed, if a segment file cannot be written, or if
     *         the extraction layer reports an {@link ExtractionException}
     */
    public IOObject[] apply() throws OperatorException {
        DocumentSegmenter segmenter = DocumentSegmenterClass.getSegmenterFromParameters(this.getParameters());
        File outDir = this.getParameterAsFile(PARAMETER_OUTPUT, true);
        File inDir = this.getParameterAsFile(PARAMETER_TEXTS);

        // File.listFiles() returns null (not an empty array) when the path is
        // not a directory or an I/O error occurs; report that instead of
        // failing with a NullPointerException.
        File[] files = inDir.listFiles();
        if (files == null) {
            // NOTE(review): 302 is assumed to be the "cannot read file" user error,
            // analogous to 303 used for write failures below - confirm against
            // the error message table.
            throw new UserError((Operator)this, 302, new Object[]{inDir, "not a readable directory"});
        }

        // Counter shared by all input files so that segment file names are unique.
        int count = 0;
        for (File file : files) {
            if (!file.isFile()) {
                // Sub-directories and other non-regular entries are skipped.
                continue;
            }
            try {
                // An explicitly configured content type takes precedence over
                // detection from the file itself.
                int type = this.isParameterSet(PARAMETER_CONTENT_TYPE)
                        ? TextExtractionWrapper.determineType(this.getParameterAsString(PARAMETER_CONTENT_TYPE))
                        : TextExtractionWrapper.determineType(file);
                String suffix = DocumentSegmenterOperator.suffixOf(file);
                Iterator<String> segments = segmenter.getSegments(file, type);
                while (segments.hasNext()) {
                    // Fetch the segment before touching the file system so a
                    // failing iterator does not leave an empty file behind.
                    String segment = segments.next();
                    String outFileName = outDir.getAbsolutePath() + File.separator + "seg" + count + "." + suffix;
                    this.writeSegment(outFileName, segment);
                    ++count;
                }
            }
            catch (ExtractionException e) {
                // Re-throw the wrapped user error with this operator attached.
                UserError error = e.getUserError();
                error.setOperator((Operator)this);
                throw error;
            }
        }
        return new IOObject[0];
    }

    /**
     * Returns the text after the last dot of the file's name, or
     * {@code "txt"} if the name contains no dot.
     */
    private static String suffixOf(File file) {
        String name = file.getName();
        int index = name.lastIndexOf('.');
        return index > -1 ? name.substring(index + 1) : DEFAULT_SUFFIX;
    }

    /**
     * Writes one segment to the given file, always closing the writer.
     *
     * @throws UserError (code 303) if the file cannot be opened, written or closed
     */
    private void writeSegment(String outFileName, String segment) throws UserError {
        try {
            BufferedWriter out = new BufferedWriter(new FileWriter(outFileName));
            try {
                out.write(segment);
            }
            finally {
                // close() also flushes the buffer; a failure here is an
                // IOException and is reported like any other write failure.
                out.close();
            }
        }
        catch (IOException e) {
            throw new UserError((Operator)this, 303, new Object[]{outFileName, e});
        }
    }

    /** This operator declares no input classes. */
    public Class<?>[] getInputClasses() {
        return new Class[0];
    }

    /** This operator declares no output classes. */
    public Class<?>[] getOutputClasses() {
        return new Class[0];
    }

    /**
     * Extends the inherited parameter list with the preview entry, the input
     * and output directories, the content type, the segment expression, the
     * CDATA switch and the namespace parameter supplied by
     * {@link FeatureExtractionUtil}.
     */
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();
        ParameterTypePreview previewType = new ParameterTypePreview(SegmenterPreviewerCreator.class, (PreviewListener)this);
        previewType.setExpert(false);
        types.add(previewType);
        types.add(new ParameterTypeDirectory(PARAMETER_TEXTS, "A directory containing the documents to be segmented", false));
        types.add(new ParameterTypeString(PARAMETER_CONTENT_TYPE, "The content type of the input texts (txt, xml, html)", true));
        types.add(new ParameterTypeDirectory(PARAMETER_OUTPUT, "The directory to which to write the segments", false));
        types.add(new ParameterTypeString(PARAMETER_EXPRESSION, "Specifies a regular expression or XPath expression that matches against substrings of the content which should be treated as individual segments. The syntax is the same as for attribute extraction (see WVTool operator), but instead of extracting only the first match, all matches are extracted and written to individual files", false));
        types.add(new ParameterTypeBoolean(PARAMETER_IGNORE_CDATA, "Specifies whether CDATA should be ignored when parsing HTML", true));
        types.add(FeatureExtractionUtil.createNamespaceParameter());
        return types;
    }
}

