package eu.openaire.publications_retriever.util.url;

import ch.qos.logback.core.CoreConstants;
import eu.openaire.publications_retriever.models.IdUrlMimeTypeTriple;
import eu.openaire.publications_retriever.util.file.FileUtils;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import java.util.Collections;
import java.util.Locale;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:eu/openaire/publications_retriever/util/url/UrlUtils.class */
/**
 * Static helpers for taking urls apart (domain / path / doc-id), for stripping temporal
 * identifiers and anchors from them, and for recording per-url results and per-domain hit counts.
 *
 * <p>All state is kept in public static concurrent collections and an atomic counter.
 */
public class UrlUtils {
    /** Value of the doc/dataset url argument of {@link #addOutputData} for which no bookkeeping is done. */
    public static final String duplicateUrlIndicator = "duplicate";

    /** Value of the doc/dataset url argument of {@link #addOutputData} that gets the second url argument added to {@link #duplicateUrls}. */
    public static final String unreachableDocOrDatasetUrlIndicator = "unreachable";

    private static final Logger logger = LoggerFactory.getLogger(UrlUtils.class);

    /**
     * Splits a url into three capture groups: group 1 is read by {@link #getPathStr},
     * group 2 by {@link #getDomainStr} and group 3 by {@link #getDocIdStr}.
     */
    public static final Pattern URL_TRIPLE = Pattern.compile("([^/]+://(?:ww(?:w|\\d)(?:(?:\\w+)?\\.)?)?([\\w.-]+)(?:[:\\d]+)?(?:.*/)?)(?:([^/^;?]*)(?:[;?][^/^=]*(?:=.*)?)?)?");

    /**
     * Matches a url carrying a "...token" or "jsessionid" parameter. Group 1 is the part of the url
     * before that parameter, group 2 (optional) is whatever follows it, starting with '?' or '&amp;'.
     */
    public static final Pattern TEMPORAL_IDENTIFIER_FILTER = Pattern.compile("([^/]+://.+)(?:(?:(?i)(?:\\?|&|;|%3b)(?:.*token|jsessionid)(?:=|%3d))[^?&]+)([?&].+)?");

    /** Matches a url ending in a '#' fragment that does not start with "#/". Group 1 is the url without the fragment. */
    public static final Pattern ANCHOR_FILTER = Pattern.compile("(.+)(#(?!/).+)");

    /** Counter incremented by {@link #addOutputData} for every doc/dataset url that is neither indicator value. */
    public static AtomicInteger sumOfDocUrlsFound = new AtomicInteger(0);

    /** Concurrent set of urls that {@link #addOutputData} recorded for the "unreachable" indicator. */
    public static final Set<String> duplicateUrls = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());

    /** Maps a doc/dataset url (temporal identifier removed) to the triple built from the data it was found with. */
    public static final ConcurrentHashMap<String, IdUrlMimeTypeTriple> docOrDatasetUrlsWithIDs = new ConcurrentHashMap<>();

    /** Per-domain hit counters, updated through {@code ConnSupportUtils.countInsertAndGetTimes}. */
    public static final ConcurrentHashMap<String, Integer> domainsAndHits = new ConcurrentHashMap<>();

    /**
     * Captures (group 1) the last two or three dot-separated labels of a domain.
     * The separators are escaped so that they match a literal '.' only, not any character.
     */
    public static final Pattern TOP_THREE_LEVEL_DOMAIN_FILTER = Pattern.compile("[\\w.-]*?((?:[\\w-]+\\.)?[\\w-]+\\.[\\w-]+)$");

    /**
     * Records the outcome for one url and queues it in {@code FileUtils.dataForOutput}.
     *
     * <p>Unless {@code str4} is one of the two indicator constants, this increments
     * {@link #sumOfDocUrlsFound}, strips a temporal identifier from {@code str4}, optionally
     * registers it in {@link #docOrDatasetUrlsWithIDs} and updates {@link #domainsAndHits}.
     * A {@code DataForOutput} is added in every case.
     *
     * @param str   first component of the {@code IdUrlMimeTypeTriple} and of the {@code DataForOutput}
     *              (presumably the record id; confirm against callers)
     * @param str2  second component of both objects; also the url added to {@link #duplicateUrls}
     *              when {@code str4} is the "unreachable" indicator
     * @param str3  url whose domain is counted; used to derive the domain when {@code str6} is {@code null}
     * @param str4  the doc/dataset url, or one of the indicator constants; must not be {@code null}
     * @param str5  forwarded unchanged as the last constructor argument of {@code DataForOutput}
     * @param str6  already-known domain of {@code str3}, or {@code null} to have it extracted here
     * @param z     whether to register the doc/dataset url in {@link #docOrDatasetUrlsWithIDs}
     * @param str7  forwarded unchanged to {@code DataForOutput}
     * @param str8  forwarded unchanged to {@code DataForOutput}
     * @param str9  forwarded unchanged to {@code DataForOutput}
     * @param str10 forwarded unchanged to {@code DataForOutput}
     * @param str11 forwarded unchanged to {@code DataForOutput}
     * @param l     forwarded unchanged to {@code DataForOutput}
     * @param str12 forwarded unchanged to {@code DataForOutput}
     * @param str13 third component of the {@code IdUrlMimeTypeTriple} (presumably the mime type;
     *              confirm), also forwarded to {@code DataForOutput}
     */
    public static void addOutputData(String str, String str2, String str3, String str4, String str5, String str6, boolean z, String str7, String str8, String str9, String str10, String str11, Long l, String str12, String str13) {
        String docOrDatasetUrl = str4;
        if (!docOrDatasetUrl.equals(duplicateUrlIndicator)) {
            if (docOrDatasetUrl.equals(unreachableDocOrDatasetUrlIndicator)) {
                duplicateUrls.add(str2);
            } else {
                sumOfDocUrlsFound.incrementAndGet();
                // Locale.ROOT keeps the check independent of the JVM's default locale
                // (e.g. under a Turkish locale "JSESSIONID" would not lower-case to "jsessionid").
                String lowerCasedUrl = docOrDatasetUrl.toLowerCase(Locale.ROOT);
                if (lowerCasedUrl.contains("token") || lowerCasedUrl.contains("jsessionid")) {
                    docOrDatasetUrl = removeTemporalIdentifier(docOrDatasetUrl);
                }
                if (z) {
                    docOrDatasetUrlsWithIDs.put(docOrDatasetUrl, new IdUrlMimeTypeTriple(str, str2, str13));
                }
                recordDomainHits(str3, str6, docOrDatasetUrl);
            }
        }
        FileUtils.dataForOutput.add(new DataForOutput(str, str2, str3, docOrDatasetUrl, str7, str8, str9, str10, str11, str12, l, str13, str5));
    }

    /**
     * Counts one hit for the domain of {@code sourceUrl} and, when the doc/dataset url differs
     * from it and lives on another domain, one hit for that domain too.
     */
    private static void recordDomainHits(String sourceUrl, String knownDomain, String docOrDatasetUrl) {
        String primaryDomain = (knownDomain != null) ? knownDomain : getDomainStr(sourceUrl, null);
        if (primaryDomain == null) {
            return;
        }
        ConnSupportUtils.countInsertAndGetTimes(domainsAndHits, primaryDomain);
        // Compare from the non-null side, so that a null sourceUrl cannot raise an NPE here.
        if (docOrDatasetUrl.equals(sourceUrl)) {
            return;
        }
        String docDomain = getDomainStr(docOrDatasetUrl, null);
        if (docDomain != null && !docDomain.equals(primaryDomain)) {
            ConnSupportUtils.countInsertAndGetTimes(domainsAndHits, docDomain);
        }
    }

    /**
     * Returns the lower-cased domain (group 2 of {@link #URL_TRIPLE}) of the given url.
     *
     * @param str     the url; only parsed when {@code matcher} is {@code null}, otherwise used for logging
     * @param matcher an already-matched {@link #URL_TRIPLE} matcher, or {@code null} to create one from {@code str}
     * @return the domain, or {@code null} when the url does not match or the group is absent or empty
     */
    public static String getDomainStr(String str, Matcher matcher) {
        String domain = extractUrlGroup(str, matcher, 2, "domain");
        return (domain != null) ? domain.toLowerCase(Locale.ROOT) : null;
    }

    /**
     * Returns group 1 of {@link #URL_TRIPLE} for the given url.
     *
     * @param str     the url; only parsed when {@code matcher} is {@code null}, otherwise used for logging
     * @param matcher an already-matched {@link #URL_TRIPLE} matcher, or {@code null} to create one from {@code str}
     * @return the path part, or {@code null} when the url does not match or the group is absent or empty
     */
    public static String getPathStr(String str, Matcher matcher) {
        return extractUrlGroup(str, matcher, 1, "pathStr");
    }

    /**
     * Returns group 3 of {@link #URL_TRIPLE} for the given url.
     *
     * @param str     the url; only parsed when {@code matcher} is {@code null}, otherwise used for logging
     * @param matcher an already-matched {@link #URL_TRIPLE} matcher, or {@code null} to create one from {@code str}
     * @return the doc-id part, or {@code null} when the url does not match or the group is absent or empty
     */
    public static String getDocIdStr(String str, Matcher matcher) {
        return extractUrlGroup(str, matcher, 3, "docID");
    }

    /**
     * Reads one capture group from the supplied matcher, or from a fresh {@link #getUrlMatcher}
     * result when none is supplied, logging a warning when the group is absent or empty.
     */
    private static String extractUrlGroup(String urlStr, Matcher matcher, int groupIndex, String groupLabel) {
        Matcher urlMatcher = (matcher != null) ? matcher : getUrlMatcher(urlStr);
        if (urlMatcher == null) {
            return null;
        }
        String value;
        try {
            value = urlMatcher.group(groupIndex);
        } catch (RuntimeException e) {
            // A caller-supplied matcher may not have been matched yet, or may belong to another pattern.
            logger.error("Could not read the {} group from the matcher of url: \"{}\"", groupLabel, urlStr, e);
            return null;
        }
        if (value != null && !value.isEmpty()) {
            return value;
        }
        logger.warn("No {} was extracted from url: \"{}\".", groupLabel, urlStr);
        return null;
    }

    /**
     * Matches the given url against {@link #URL_TRIPLE}, after dropping a single trailing '/'.
     *
     * @param str the url to parse
     * @return the successfully matched matcher, or {@code null} when {@code str} is {@code null}
     *         or does not match
     */
    public static Matcher getUrlMatcher(String str) {
        if (str == null) {
            logger.error("The received \"urlStr\" was null in \"getUrlMatcher()\"!");
            return null;
        }
        String urlStr = str.endsWith("/") ? str.substring(0, str.length() - 1) : str;
        Matcher matcher = URL_TRIPLE.matcher(urlStr);
        if (matcher.matches()) {
            return matcher;
        }
        logger.warn("Unexpected URL_TRIPLE's ({}) mismatch for url: \"{}\"", matcher, urlStr);
        return null;
    }

    /**
     * Reduces a domain to its last two or three dot-separated labels.
     *
     * @param str the domain to reduce
     * @return group 1 of {@link #TOP_THREE_LEVEL_DOMAIN_FILTER}; the input itself when it does not
     *         match (a warning is logged); {@code null} when the input is {@code null}
     */
    public static String getTopThreeLevelDomain(String str) {
        if (str == null) {
            logger.error("The received \"domainStr\" was null in \"getTopThreeLevelDomain()\"!");
            return null;
        }
        Matcher matcher = TOP_THREE_LEVEL_DOMAIN_FILTER.matcher(str);
        if (matcher.matches()) {
            // Group 1 is mandatory in the pattern, so it is always present after a successful match.
            return matcher.group(1);
        }
        logger.warn("Could not retrieve the top-three-level-domain from \"{}\"", str);
        return str;
    }

    /**
     * Removes a "...token" / "jsessionid" parameter (see {@link #TEMPORAL_IDENTIFIER_FILTER}) from a url.
     * When the text following the removed parameter starts with '&amp;' and the kept prefix has no '?',
     * that leading '&amp;' is turned into '?'.
     *
     * @param str the url to clean
     * @return the cleaned url; the input itself when it carries no such parameter; the literal
     *         string {@code "null"} (not a null reference) when the input is {@code null}
     */
    public static String removeTemporalIdentifier(String str) {
        if (str == null) {
            logger.error("The received \"urlStr\" was null in \"removeTemporalIdentifier()\"!");
            // NOTE(review): this returns the text "null", unlike removeAnchor() which returns a null
            // reference. Kept as-is because callers may rely on a non-null result; confirm the intent.
            return "null";
        }
        Matcher matcher = TEMPORAL_IDENTIFIER_FILTER.matcher(str);
        if (!matcher.matches()) {
            return str;
        }
        // Group 1 is mandatory and non-empty on a match; group 2 is optional.
        String beforeIdentifier = matcher.group(1);
        String afterIdentifier = matcher.group(2);
        if (afterIdentifier == null || afterIdentifier.isEmpty()) {
            return beforeIdentifier;
        }
        if (afterIdentifier.charAt(0) == '&' && !beforeIdentifier.contains("?")) {
            afterIdentifier = "?" + afterIdentifier.substring(1);
        }
        return beforeIdentifier + afterIdentifier;
    }

    /**
     * Removes a trailing '#' fragment from a url, unless the fragment starts with "#/"
     * (see {@link #ANCHOR_FILTER}).
     *
     * @param str the url to clean
     * @return the url without the fragment; the input itself when it has no such fragment;
     *         {@code null} when the input is {@code null}
     */
    public static String removeAnchor(String str) {
        if (str == null) {
            logger.error("The received \"urlStr\" was null in \"removeAnchor()\"!");
            return null;
        }
        Matcher matcher = ANCHOR_FILTER.matcher(str);
        // Group 1 is mandatory and non-empty whenever the pattern matches.
        return matcher.matches() ? matcher.group(1) : str;
    }
}
