package eu.dnetlib.pace.util;

import com.google.common.collect.Lists;
import eu.dnetlib.data.transform.xml2.Utils;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.WfConfig;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.MapDocumentComparator;
import eu.dnetlib.pace.tree.support.TreeProcessor;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Queue;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/* loaded from: input_file:WEB-INF/lib/dnet-pace-core-4.0.0-20200210.114101-1.jar:eu/dnetlib/pace/util/BlockProcessor.class */
/**
 * Compares the documents of a single block (documents grouped under one key) pairwise and
 * reports the outcome through a {@link Reporter}.
 *
 * <p>Documents are first de-duplicated by identifier and ordered with a
 * {@link MapDocumentComparator} built on the configured order field. Each document is then taken
 * in turn as a pivot and compared, via {@link TreeProcessor}, against the documents that follow
 * it in the queue. Matching pairs are emitted in both directions; every decision is also counted
 * under the configured entity type.
 */
public class BlockProcessor {
    /**
     * Counter names appended by {@link #constructAccumulator(DedupConfig)}, each formatted as
     * {@code entityType::counterName}. The list is shared, mutable and grows on every call.
     */
    public static final List<String> accumulators = new ArrayList<>();
    private static final Log log = LogFactory.getLog(BlockProcessor.class);

    private static final String ACCUMULATOR_FORMAT = "%s::%s";
    private static final String COUNTER_SINGLE_RECORD = "records per hash key = 1";
    private static final String COUNTER_SKIP_LIST = "skip list";
    private static final String COUNTER_SIMILARITY = "dedupSimilarity (x2)";

    private final DedupConfig dedupConf;

    /**
     * Appends to {@link #accumulators} the names of the six counters this class increments,
     * qualified with the entity type of the given configuration.
     *
     * @param dedupConfig configuration providing entity type, order field, group size and threshold
     */
    public static void constructAccumulator(DedupConfig dedupConfig) {
        WfConfig wf = dedupConfig.getWf();
        String entityType = wf.getEntityType();
        accumulators.add(String.format(ACCUMULATOR_FORMAT, entityType, COUNTER_SINGLE_RECORD));
        accumulators.add(String.format(ACCUMULATOR_FORMAT, entityType, missingCounter(wf)));
        accumulators.add(String.format(ACCUMULATOR_FORMAT, entityType, skippedCounter(wf)));
        accumulators.add(String.format(ACCUMULATOR_FORMAT, entityType, COUNTER_SKIP_LIST));
        accumulators.add(String.format(ACCUMULATOR_FORMAT, entityType, COUNTER_SIMILARITY));
        accumulators.add(String.format(ACCUMULATOR_FORMAT, entityType, belowThresholdCounter(wf)));
    }

    /**
     * @param dedupConfig configuration used for ordering, limits, skip list and comparison
     */
    public BlockProcessor(DedupConfig dedupConfig) {
        this.dedupConf = dedupConfig;
    }

    /**
     * Processes a block given as a list. Unlike {@link #process(String, Iterable, Reporter)} the
     * queue is not simplified (no grouping by order-field value) before comparison.
     *
     * @param str      block key; not used by this method
     * @param list     documents of the block
     * @param reporter sink for counters and emitted pairs
     */
    public void processSortedBlock(String str, List<MapDocument> list, Reporter reporter) {
        if (list.size() > 1) {
            process(prepare(list), reporter);
        } else {
            reporter.incrementCounter(this.dedupConf.getWf().getEntityType(), COUNTER_SINGLE_RECORD, 1L);
        }
    }

    /**
     * Processes a block: de-duplicates and orders the documents, drops the ones without an order
     * field and the over-sized groups of equal order value, then compares the remainder.
     *
     * @param str      block key, forwarded to the simplification step
     * @param iterable documents of the block
     * @param reporter sink for counters and emitted pairs
     */
    public void process(String str, Iterable<MapDocument> iterable, Reporter reporter) {
        Queue<MapDocument> ordered = prepare(iterable);
        if (ordered.size() > 1) {
            process(simplifyQueue(ordered, str, reporter), reporter);
        } else {
            reporter.incrementCounter(this.dedupConf.getWf().getEntityType(), COUNTER_SINGLE_RECORD, 1L);
        }
    }

    /**
     * Builds a priority queue of the documents, ordered by the configured order field and holding
     * each identifier at most once. Documents are ignored once the queue holds more than
     * {@code queueMaxSize} elements.
     */
    private Queue<MapDocument> prepare(Iterable<MapDocument> iterable) {
        WfConfig wf = this.dedupConf.getWf();
        Queue<MapDocument> ordered =
                new PriorityQueue<MapDocument>(100, new MapDocumentComparator(wf.getOrderField()));
        HashSet<String> seenIds = new HashSet<>();
        int queueMaxSize = wf.getQueueMaxSize();
        for (MapDocument doc : iterable) {
            // The size check comes first so that ignored documents are not recorded as seen.
            if (ordered.size() <= queueMaxSize && seenIds.add(doc.getIdentifier())) {
                ordered.add(doc);
            }
        }
        return ordered;
    }

    /**
     * Drains {@code queue} into a new FIFO queue, dropping documents without an order-field value
     * (counted as "missing ...") and whole runs of consecutive documents sharing the same cleaned
     * order value when the run reaches the configured group max size.
     *
     * @param str block key; currently unused
     */
    private Queue<MapDocument> simplifyQueue(Queue<MapDocument> queue, String str, Reporter reporter) {
        WfConfig wf = this.dedupConf.getWf();
        Queue<MapDocument> simplified = new LinkedList<>();
        List<MapDocument> group = new ArrayList<>();
        String groupKey = "";
        while (!queue.isEmpty()) {
            MapDocument doc = queue.remove();
            Field orderValues = doc.values(wf.getOrderField());
            // A null field is treated like an empty one, as the pairwise comparison step does.
            if (orderValues == null || orderValues.isEmpty()) {
                reporter.incrementCounter(wf.getEntityType(), missingCounter(wf), 1L);
                continue;
            }
            String key = NGramUtils.cleanupForOrdering(orderValues.stringValue());
            if (!key.equals(groupKey)) {
                // Order value changed: flush the finished run and start a new one.
                populateSimplifiedQueue(simplified, group, reporter);
                group.clear();
                groupKey = key;
            }
            group.add(doc);
        }
        populateSimplifiedQueue(simplified, group, reporter);
        return simplified;
    }

    /**
     * Appends {@code group} to {@code target} when it is smaller than the configured group max
     * size; otherwise counts its documents as skipped.
     */
    private void populateSimplifiedQueue(Queue<MapDocument> target, List<MapDocument> group, Reporter reporter) {
        WfConfig wf = this.dedupConf.getWf();
        if (group.size() < wf.getGroupMaxSize()) {
            target.addAll(group);
        } else {
            reporter.incrementCounter(wf.getEntityType(), skippedCounter(wf), group.size());
        }
    }

    /**
     * Consumes the queue: each head element becomes the pivot and is compared against the
     * documents still queued behind it.
     *
     * <p>The scan for one pivot stops at the first candidate whose namespace prefix is in the skip
     * list, or once more than {@code slidingWindowSize} comparisons have been performed for that
     * pivot, or when the queue has been fully traversed.
     */
    private void process(Queue<MapDocument> queue, Reporter reporter) {
        while (!queue.isEmpty()) {
            MapDocument pivot = queue.remove();
            String pivotId = pivot.getIdentifier();
            WfConfig wf = this.dedupConf.getWf();
            if (orderValueOrNull(pivot, wf) == null) {
                continue;
            }
            int compared = 0;
            for (MapDocument candidate : queue) {
                String candidateId = candidate.getIdentifier();
                if (mustSkip(candidateId)) {
                    reporter.incrementCounter(wf.getEntityType(), COUNTER_SKIP_LIST, 1L);
                    break;
                }
                if (compared > wf.getSlidingWindowSize()) {
                    break;
                }
                if (!candidateId.equals(pivotId) && orderValueOrNull(candidate, wf) != null) {
                    boolean similar = new TreeProcessor(this.dedupConf).compare(pivot, candidate);
                    emitOutput(similar, pivotId, candidateId, reporter);
                    // Only actual comparisons consume the sliding window.
                    compared++;
                }
            }
        }
    }

    /** Returns the string value of the document's order field, or null when absent or empty. */
    private static String orderValueOrNull(MapDocument doc, WfConfig wf) {
        Field field = doc.values(wf.getOrderField());
        return (field == null || field.isEmpty()) ? null : field.stringValue();
    }

    /**
     * Records the result of one comparison: a match is emitted in both directions and counted,
     * a non-match only increments the "d < threshold" counter.
     */
    private void emitOutput(boolean z, String str, String str2, Reporter reporter) {
        WfConfig wf = this.dedupConf.getWf();
        if (z) {
            writeSimilarity(reporter, str, str2);
            reporter.incrementCounter(wf.getEntityType(), COUNTER_SIMILARITY, 1L);
        } else {
            reporter.incrementCounter(wf.getEntityType(), belowThresholdCounter(wf), 1L);
        }
    }

    /** Tells whether the namespace prefix of the identifier is listed in the configured skip list. */
    private boolean mustSkip(String str) {
        String nsPrefix = getNsPrefix(str);
        // substringBetween yields null when the delimiters are missing; such ids are never skipped.
        return nsPrefix != null && this.dedupConf.getWf().getSkipList().contains(nsPrefix);
    }

    /** Extracts the text between the first "|" and the following {@code Utils.ID_SEPARATOR}. */
    private String getNsPrefix(String str) {
        return StringUtils.substringBetween(str, "|", Utils.ID_SEPARATOR);
    }

    /** Emits the pair under the entity type in both directions. */
    private void writeSimilarity(Reporter reporter, String str, String str2) {
        String entityType = this.dedupConf.getWf().getEntityType();
        reporter.emit(entityType, str, str2);
        reporter.emit(entityType, str2, str);
    }

    /** Name of the counter for documents lacking the order field. */
    private static String missingCounter(WfConfig wf) {
        return "missing " + wf.getOrderField();
    }

    /** Name of the counter for documents dropped because their group was too large. */
    private static String skippedCounter(WfConfig wf) {
        return String.format("Skipped records for count(%s) >= %s", wf.getOrderField(), wf.getGroupMaxSize());
    }

    /** Name of the counter for compared pairs that were not judged similar. */
    private static String belowThresholdCounter(WfConfig wf) {
        return "d < " + wf.getThreshold();
    }
}
