/*
 * Decompiled with CFR 0.152.
 */
package eu.openaire.publications_retriever.crawler;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimaps;
import com.google.common.collect.SetMultimap;
import eu.openaire.publications_retriever.PublicationsRetriever;
import eu.openaire.publications_retriever.models.IdUrlMimeTypeTriple;
import eu.openaire.publications_retriever.util.file.FileUtils;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.http.HttpConnUtils;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
import eu.openaire.publications_retriever.util.url.UrlTypeChecker;
import eu.openaire.publications_retriever.util.url.UrlUtils;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Path-based prediction helper ("MLA") for the crawler.
 *
 * <p>It records, for every successful (docPage, docUrl) pair, the path of the page and the path
 * of the document ({@link #gatherMLData}). Later, for a new page whose path is already known, it
 * builds candidate docUrls from the recorded docUrl-paths and the page's doc-ID and checks them
 * ({@link #predictInternalDocUrl}). {@link #shouldRunPrediction} decides, based on the measured
 * success rate, whether predictions are currently worth attempting.
 *
 * <p>All state is static. Creating an instance only tunes the static thresholds according to the
 * size of the input.
 */
public class MachineLearning {
    private static final Logger logger = LoggerFactory.getLogger(MachineLearning.class);

    /** Global switch read by other classes; this class itself does not consult it. */
    public static final boolean useMLA = false;

    /** Minimum success percentage of a round for the MLA to keep running without sleeping. */
    private static final float leastSuccessPercentageForMLA = 51.0f;

    /** Value of {@link #docUrlsFoundByMLA} when the current measuring round started. */
    private static int latestMLADocUrlsFound = 0;
    /** Number of gathered data-points required before the MLA starts (capped by the constructor). */
    private static int urlsToGatherBeforeStarting = 5000;
    /** Number of pages a round has to last before its success rate is evaluated. */
    private static int leastNumOfUrlsToCheckBeforeAccuracyTest = 1000;
    /** Number of pages to skip ("sleep") after an unsatisfying round. */
    private static int urlsToWaitUntilRestartMLA = 30000;
    private static boolean mlaStarted = false;
    /** Value of {@link #totalPagesReachedMLAStage} at which the current sleep period ends. */
    private static int endOfSleepNumOfUrls = 0;
    private static int latestSuccessBreakPoint = 0;
    /** Value of {@link #pageUrlsCheckedWithMLA} when the current measuring round started. */
    private static int latestUrlsMLAChecked = 0;

    public static final AtomicInteger timesGatheredData = new AtomicInteger(0);
    private static final AtomicInteger pageUrlsCheckedWithMLA = new AtomicInteger(0);
    private static boolean isInSleepMode = false;
    public static AtomicInteger totalPagesReachedMLAStage = new AtomicInteger(0);

    /**
     * docPage-path -> docUrl-paths seen for it. This is a Guava synchronized multimap: single
     * calls are thread-safe, but iterating any of its views requires holding its monitor.
     */
    public static final SetMultimap<String, String> successPathsHashMultiMap = Multimaps.synchronizedSetMultimap(HashMultimap.<String, String>create());

    /** docUrl-path -> file extension (as matched by {@code FileUtils.EXTENSION_PATTERN}) of the docUrls under it. */
    public static final ConcurrentHashMap<String, String> successDocPathsExtensionHashMap = new ConcurrentHashMap<>();

    public static AtomicInteger docUrlsFoundByMLA = new AtomicInteger(0);

    /** Domains for which no data is gathered and no prediction is attempted. */
    private static final Set<String> domainsBlockedFromMLA = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
    private static final ConcurrentHashMap<String, Integer> timesDomainsFailedInMLA = new ConcurrentHashMap<>();
    private static final int timesToFailBeforeBlockedFromMLA = 10;

    /** Success rate of every evaluated round. Iteration requires holding the list's monitor. */
    private static final List<Double> successRateList = Collections.synchronizedList(new ArrayList<Double>());

    static {
        domainsBlockedFromMLA.add("sciencedirect.com");
    }

    /**
     * Adapts the static MLA thresholds to the (approximate) number of urls that will be checked.
     * The approximation is 70% of the input lines when id-url pairs are used, otherwise 85%.
     */
    public MachineLearning() {
        logger.debug("Initializing the MLA..");
        double usableFraction = LoaderAndChecker.useIdUrlPairs ? 0.7 : 0.85;
        long approxNumOfTotalUrlsToCheck = (long)((double)FileUtils.numOfLines * usableFraction);
        logger.debug("\"approxNumOfTotalUrlsToCheck\" = " + approxNumOfTotalUrlsToCheck);

        // Gather at most 10% of the input before starting.
        int tenPercentOfInput = (int)((double)approxNumOfTotalUrlsToCheck * 0.1);
        urlsToGatherBeforeStarting = Math.min(urlsToGatherBeforeStarting, tenPercentOfInput);
        logger.debug("\"urlsToGatherBeforeStarting\" = " + urlsToGatherBeforeStarting);

        // A round lasts at least 5% of the input.
        int fivePercentOfInput = (int)((double)approxNumOfTotalUrlsToCheck * 0.05);
        leastNumOfUrlsToCheckBeforeAccuracyTest = Math.max(leastNumOfUrlsToCheckBeforeAccuracyTest, fivePercentOfInput);
        logger.debug("\"leastNumOfUrlsToCheckBeforeAccuracyTest\" = " + leastNumOfUrlsToCheckBeforeAccuracyTest);

        // Sleep for at most 20% of the input.
        int twentyPercentOfInput = (int)((double)approxNumOfTotalUrlsToCheck * 0.2);
        urlsToWaitUntilRestartMLA = Math.min(urlsToWaitUntilRestartMLA, twentyPercentOfInput);
        logger.debug("\"urlsToWaitUntilRestartMLA\" = " + urlsToWaitUntilRestartMLA);
    }

    /**
     * Records the (docPage-path, docUrl-path) relation of a successful retrieval, plus the
     * extension of the docUrl's ID when it has one. Nothing is recorded when the page is the
     * docUrl itself, when the domain is blocked, or when any url-part cannot be extracted.
     *
     * @param docPage    the url of the page which led to the docUrl
     * @param docUrl     the url of the document
     * @param pageDomain the domain of {@code docPage}, or {@code null} to have it extracted here
     */
    public static void gatherMLData(String docPage, String docUrl, String pageDomain) {
        if (docPage.equals(docUrl)) {
            return;
        }
        Matcher docPageMatcher = null;
        if (pageDomain == null) {
            docPageMatcher = UrlUtils.getUrlMatcher(docPage);
            if (docPageMatcher == null) {
                return;
            }
            pageDomain = UrlUtils.getDomainStr(docPage, docPageMatcher);
            if (pageDomain == null) {
                return;
            }
        }
        if (domainsBlockedFromMLA.contains(pageDomain)) {
            return;
        }
        // NOTE(review): "docPageMatcher" is still null here when the caller supplied the domain;
        // presumably UrlUtils.getPathStr() builds its own matcher in that case - confirm.
        String docPagePath = UrlUtils.getPathStr(docPage, docPageMatcher);
        if (docPagePath == null) {
            return;
        }
        Matcher docUrlMatcher = UrlUtils.getUrlMatcher(docUrl);
        if (docUrlMatcher == null) {
            return;
        }
        String docUrlPath = UrlUtils.getPathStr(docUrl, docUrlMatcher);
        if (docUrlPath == null) {
            return;
        }
        String docUrlID = UrlUtils.getDocIdStr(docUrl, docUrlMatcher);
        if (docUrlID == null) {
            return;
        }
        Matcher extensionMatcher = FileUtils.EXTENSION_PATTERN.matcher(docUrlID);
        if (extensionMatcher.find()) {
            // The whole match is never null after a successful find().
            successDocPathsExtensionHashMap.put(docUrlPath, extensionMatcher.group());
        }
        successPathsHashMultiMap.put(docPagePath, docUrlPath);
        timesGatheredData.incrementAndGet();
    }

    /**
     * Computes the success percentage of the current measuring round.
     *
     * @return docUrls found by the MLA in this round, as a percentage of the pages checked in this
     *         round; {@code 0.0} when no page has been checked yet (instead of NaN/Infinity)
     */
    public static double getCurrentSuccessRate() {
        int checkedInRound = pageUrlsCheckedWithMLA.get() - latestUrlsMLAChecked;
        if (checkedInRound <= 0) {
            return 0.0;
        }
        int foundInRound = docUrlsFoundByMLA.get() - latestMLADocUrlsFound;
        return (double)foundInRound * 100.0 / (double)checkedInRound;
    }

    /**
     * Decides whether a prediction should be attempted for the current page.
     *
     * <p>No prediction runs until enough data has been gathered. Afterwards the success rate is
     * evaluated at every break-point: a satisfying rate starts a new round, an unsatisfying one
     * puts the MLA to sleep for {@code urlsToWaitUntilRestartMLA} pages.
     *
     * @return {@code true} when the caller should run {@link #predictInternalDocUrl}
     */
    public static synchronized boolean shouldRunPrediction() {
        if (!mlaStarted) {
            if (timesGatheredData.get() <= urlsToGatherBeforeStarting) {
                latestSuccessBreakPoint = urlsToGatherBeforeStarting;
                return false;
            }
            mlaStarted = true;
            logger.info("Starting the MLA..");
        }

        int pagesReached = totalPagesReachedMLAStage.get();
        if (isInSleepMode) {
            if (pagesReached <= endOfSleepNumOfUrls) {
                return false;
            }
            logger.debug("MLA's \"sleepMode\" is finished, it will now restart.");
            isInSleepMode = false;
            return true;
        }

        // Widen before adding, so that the sum cannot overflow the int range.
        long nextBreakPointForSuccessRate = (long)latestSuccessBreakPoint + leastNumOfUrlsToCheckBeforeAccuracyTest + endOfSleepNumOfUrls;
        if ((long)pagesReached < nextBreakPointForSuccessRate) {
            return true;
        }

        double curSuccessRate = MachineLearning.getCurrentSuccessRate();
        logger.debug("Breakpoint (urlNum=" + nextBreakPointForSuccessRate + ") reached. Current round's success rate of MLA = " + PublicationsRetriever.df.format(curSuccessRate) + "%");
        successRateList.add(curSuccessRate);

        if (curSuccessRate >= leastSuccessPercentageForMLA) {
            endOfSleepNumOfUrls = 0;
            latestSuccessBreakPoint = totalPagesReachedMLAStage.get();
            return true;
        }

        logger.debug("MLA's success-rate is lower than the satisfying one (" + leastSuccessPercentageForMLA + "). Entering \"sleep-mode\", but continuing to gather ML-data...");
        endOfSleepNumOfUrls = totalPagesReachedMLAStage.get() + urlsToWaitUntilRestartMLA;
        // Start a fresh measuring round for when the MLA wakes up.
        latestMLADocUrlsFound = docUrlsFoundByMLA.get();
        latestUrlsMLAChecked = pageUrlsCheckedWithMLA.get();
        // NOTE(review): the intent of this single-step increment is not evident from this class - confirm.
        ++latestSuccessBreakPoint;
        isInSleepMode = true;
        return false;
    }

    /**
     * Tries to predict the docUrl of a page from the docUrl-paths recorded for the page's path.
     * A candidate is only checked when it appears among the links of the page.
     *
     * @param urlId                  the id of the record, forwarded to the connection helpers
     * @param sourceUrl              the originally requested url, forwarded to the connection helpers
     * @param pageUrl                the url of the page to predict for
     * @param pageDomain             the domain of {@code pageUrl}
     * @param pageLinksWithStructure the links of the page; only its keys are consulted here
     * @return {@code true} when a predicted docUrl was confirmed (or had already been found before)
     */
    public static boolean predictInternalDocUrl(String urlId, String sourceUrl, String pageUrl, String pageDomain, HashMap<String, String> pageLinksWithStructure) {
        if (domainsBlockedFromMLA.contains(pageDomain)) {
            logger.debug("Avoiding the MLA-prediction for incompatible domain: \"" + pageDomain + "\".");
            return false;
        }
        Matcher urlMatcher = UrlUtils.getUrlMatcher(pageUrl);
        if (urlMatcher == null) {
            return false;
        }
        String pagePath = UrlUtils.getPathStr(pageUrl, urlMatcher);
        if (pagePath == null) {
            return false;
        }

        // Take a snapshot while holding the multimap's monitor: its views must not be iterated
        // unsynchronized, and the lock must not be held during the network checks below.
        List<String> knownDocUrlPaths;
        synchronized (successPathsHashMultiMap) {
            knownDocUrlPaths = new ArrayList<>(successPathsHashMultiMap.get(pagePath));
        }
        int pathsSize = knownDocUrlPaths.size();
        if (pathsSize == 0) {
            return false;
        }
        if (pathsSize > 5) {
            logger.warn("Domain: \"" + pageDomain + "\" was blocked from being accessed again by the MLA, after retrieving a proved-to-be incompatible pagePath (having more than 5 possible docUrl-paths).");
            domainsBlockedFromMLA.add(pageDomain);
            successPathsHashMultiMap.removeAll(pagePath);
            return false;
        }

        String docIdStr = UrlUtils.getDocIdStr(pageUrl, urlMatcher);
        if (docIdStr == null) {
            return false;
        }
        // Locale.ROOT keeps the lower-casing independent of the default locale of the JVM.
        if (UrlTypeChecker.PLAIN_PAGE_EXTENSION_FILTER.matcher(docIdStr.toLowerCase(Locale.ROOT)).matches()) {
            docIdStr = FileUtils.EXTENSION_PATTERN.matcher(docIdStr).replaceAll("");
        }
        pageUrlsCheckedWithMLA.incrementAndGet();

        StringBuilder strB = new StringBuilder(300);
        for (String knownDocUrlPath : knownDocUrlPaths) {
            strB.append(knownDocUrlPath).append(docIdStr);
            String extension = successDocPathsExtensionHashMap.get(knownDocUrlPath);
            if (extension != null) {
                strB.append(extension);
            }
            String predictedDocUrl = strB.toString();
            strB.setLength(0);

            if (!pageLinksWithStructure.containsKey(predictedDocUrl)) {
                continue;
            }
            logger.debug("Found a \"predictedDocUrl\" which exists in the \"currentPageLinks\": " + predictedDocUrl);

            IdUrlMimeTypeTriple originalIdUrlMimeTypeTriple = UrlUtils.resultUrlsWithIDs.get(predictedDocUrl);
            if (originalIdUrlMimeTypeTriple != null) {
                logger.info("MachineLearningAlgorithm got a hit for pageUrl: \"" + pageUrl + "\"! Resulted (already found before) docUrl was: \"" + predictedDocUrl + "\"");
                ConnSupportUtils.handleReCrossedTargetUrl(urlId, sourceUrl, pageUrl, predictedDocUrl, originalIdUrlMimeTypeTriple, false);
                docUrlsFoundByMLA.incrementAndGet();
                return true;
            }

            try {
                logger.debug("Going to connect & check predictedDocUrl: \"" + predictedDocUrl + "\", made out from pageUrl: \"" + pageUrl + "\"");
                if (HttpConnUtils.connectAndCheckMimeType(urlId, sourceUrl, pageUrl, predictedDocUrl, null, false, true)) {
                    logger.info("MachineLearningAlgorithm got a hit for pageUrl: \"" + pageUrl + "\"! Resulted docUrl was: \"" + predictedDocUrl + "\"");
                    docUrlsFoundByMLA.incrementAndGet();
                    return true;
                }
                logger.debug("The predictedDocUrl was not a valid docUrl: \"" + predictedDocUrl + "\"");
            }
            catch (Exception ignored) {
                // Best-effort: a candidate that cannot be checked is skipped and the next one is tried.
            }
        }

        if (ConnSupportUtils.countAndBlockDomainAfterTimes(domainsBlockedFromMLA, timesDomainsFailedInMLA, pageDomain, timesToFailBeforeBlockedFromMLA, false)) {
            logger.warn("Domain: \"" + pageDomain + "\" was blocked from being accessed again by the MLA, after proved to be incompatible " + timesToFailBeforeBlockedFromMLA + " times.");
            // removeAll() atomically detaches the paths and returns them, so the cleanup below
            // iterates a collection that no other thread can modify.
            for (String docPath : successPathsHashMultiMap.removeAll(pagePath)) {
                successDocPathsExtensionHashMap.remove(docPath);
            }
        }
        return false;
    }

    /**
     * Computes the mean of the success rates of all evaluated rounds.
     *
     * @return the mean success percentage, or {@link #getCurrentSuccessRate()} when no round has
     *         been evaluated yet
     */
    public static double getAverageSuccessRate() {
        // Work on a private copy: the shared list must not be iterated without its monitor and
        // there is no reason to reorder it.
        List<Double> rates;
        synchronized (successRateList) {
            rates = new ArrayList<>(successRateList);
        }
        if (rates.isEmpty()) {
            return MachineLearning.getCurrentSuccessRate();
        }
        // Sum in ascending order, as before.
        Collections.sort(rates);
        double sumOfSuccessRates = 0.0;
        for (double rate : rates) {
            sumOfSuccessRates += rate;
        }
        return sumOfSuccessRates / (double)rates.size();
    }

    /** Logs, at debug level, every recorded docPage-path together with its docUrl-paths. */
    public static void printGatheredData() {
        if (!logger.isDebugEnabled()) {
            return;
        }
        logger.debug("Here is the MLA data gathered throughout the program's execution:");

        // Copy under the multimap's monitor, then log without holding the lock.
        SetMultimap<String, String> gatheredPaths;
        synchronized (successPathsHashMultiMap) {
            gatheredPaths = HashMultimap.create(successPathsHashMultiMap);
        }

        Set<String> docPagePaths = gatheredPaths.keySet();
        logger.debug("Data was gathered and accepted for " + docPagePaths.size() + " docPagePaths:");
        for (String docPagePath : docPagePaths) {
            logger.debug("\nDocPagePath: " + docPagePath + "\n\tdocUrlPaths:");
            for (String docUrlPath : gatheredPaths.get(docPagePath)) {
                logger.debug("\tDocUrlPath: " + docUrlPath);
            }
        }
    }
}

