package eu.openaire.publications_retriever.util.url;

import eu.openaire.publications_retriever.crawler.PageCrawler;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:eu/openaire/publications_retriever/util/url/UrlTypeChecker.class */
public class UrlTypeChecker {
    private static final String htOrPhpExtensionsPattern = "(?:[\\w]?ht(?:[\\w]{1,2})?|php[\\d]?)";
    private static final String mediaExtensionsPattern = "ico|gif|jpg|jpeg|png|wav|mp3|mp4|webm|mkv|mov";
    private static final String docOrDatasetKeywords = "(?:file|pdf|document|dataset|article|fulltext)";
    private static final String wordsPattern = "[\\w/_.,-]{0,100}";
    private static final String docOrDatasetNegativeLookAroundPattern = "(?<![\\w/_.,-]{0,100}(?:file|pdf|document|dataset|article|fulltext)[\\w/_.,-]{0,100})(?!.*(?:file|pdf|document|dataset|article|fulltext).*)";
    private static final Logger logger = LoggerFactory.getLogger((Class<?>) UrlTypeChecker.class);
    public static final Pattern URL_DIRECTORY_FILTER = Pattern.compile("[^/]+://.*/(?:(discover|profile|user|survey|index|media|theme|product|deposit|default|shop|view)/(?<![\\w/_.,-]{0,100}(?:file|pdf|document|dataset|article|fulltext)[\\w/_.,-]{0,100})(?!.*(?:file|pdf|document|dataset|article|fulltext).*)|(?:(?:ldap|password)-)?login|ac[c]?ess(?![./]+)|sign[-]?(?:in|out|up)|session|(?:how-to-)?(:?join[^t]|subscr)|regist(?:er|ration)|submi(?:t|ssion)|(?:post|send|export|(?:wp-)?admin|home|form|career[s]?|company)/|watch|browse|import|bookmark|announcement|feedback|share[^d]|about|(?:[^/]+-)?faq|wiki|news|events|cart|support|(?:site|html)map|documentation|help|license|disclaimer|copyright|(?:site-)?polic(?:y|ies)(?!.*paper)|privacy|terms|law|principles|(?:my|your|create)?[-]?account|my(?:dspace|selection|cart)|(?:service|help)[-]?desk|settings|fund|aut[h]?or(?<![\\w/_.,-]{0,100}(?:file|pdf|document|dataset|article|fulltext)[\\w/_.,-]{0,100})(?!.*(?:file|pdf|document|dataset|article|fulltext).*)|journal/key|(?:journal-)?editor|author:|(?<!ntrs.nasa.gov/(?:api/)?)citation|review|external|facets|statistics|application|selfarchive|permission|ethic(s)?/.*/view/|conta[c]?t|wallet|contribute|donate|our[_-][\\w]+|template|logo|image|photo/|video|advertiser|most-popular|people|(?:the)?press|for-authors|customer-service[s]?|captcha|clipboard|dropdown|widget|(?:forum|blog|column|row|js|css|rss|legal)/|(?:(?:advanced[-]?)?search|search/advanced|search-results|(?:[e]?books|journals)(?:-catalog)?|issue|docs|oai|(?:abstracting-)?indexing|online[-]?early|honors|awards|meetings|calendar|diversity|scholarships|invo(?:ice|lved)|errata|classroom|publish(?:-with-us)?|upload|products|forgot|home|ethics|comics|podcast|trends|bestof|booksellers|recommendations|bibliographic|volume[s]?)[/]?$|rights[-]?permissions|publication[-]?ethics|advertising|reset[-]?password|\\*/|communit(?:y|ies)|restricted|noaccess|crawlprevention|error|(?:mis|ab)use|\\?denied|gateway|defaultwebpage|sorryserver|(?<!response_type=)cookie|(?:page-)?not[-]?found|(?:404(?:_response)?|accessibility|invalid|catalog(?:ue|ar|o)?)\\.(?:[\\w]?ht(?:[\\w]{1,2})?|php[\\d]?)).*");
    public static final Pattern CURRENTLY_UNSUPPORTED_DOC_EXTENSION_FILTER = Pattern.compile(".+\\.(?:(?:doc|ppt)[x]?|ps|epub|od[tp]|djvu|rtf)(?:\\?.+)?$");
    public static final Pattern URL_FILE_EXTENSION_FILTER = Pattern.compile(".+\\.(?:css|js(?:\\?y)?|ico|gif|jpg|jpeg|png|wav|mp3|mp4|webm|mkv|mov|pt|bib|nt|refer|enw|ris|mso|dtl|do|asc|c|cc(?<![\\w/_.,-]{0,100}(?:file|pdf|document|dataset|article|fulltext)[\\w/_.,-]{0,100})(?!.*(?:file|pdf|document|dataset|article|fulltext).*)|cxx|cpp|java|py)(?:\\?.+)?$");
    public static final Pattern INTERNAL_LINKS_KEYWORDS_FILTER = Pattern.compile(".*(?:doi.org|\\?l[a]?n[g]?=|isallowed=n|site=|linkout|login|linklistener).*");
    public static final Pattern PLAIN_PAGE_EXTENSION_FILTER = Pattern.compile(".+(?<!(?:file|pdf|document|dataset|article|fulltext))\\.(?:(?:[\\w]?ht(?:[\\w]{1,2})?|php[\\d]?)|[aj]sp[x]?|jsf|do|asc|cgi|cfm)(?:\\?(?!.*(?:file|pdf|document|dataset|article|fulltext)).*)?$");
    public static final Pattern INTERNAL_LINKS_FILE_FORMAT_FILTER = Pattern.compile(".+format=(?:xml|(?:[\\w]?ht(?:[\\w]{1,2})?|php[\\d]?)|rss|ris|bib).*");
    public static final Pattern SPECIFIC_DOMAIN_FILTER = Pattern.compile("[^/]+://[^/]*(?<=[/.])(?:(?<!drive.)google\\.|goo.gl|gstatic|facebook|fb.me|twitter|(?:meta|xing|baidu|t|x|vk).co|insta(?:gram|paper)|tiktok|youtube|vimeo|linkedin|ebay|bing|(?:amazon|[./]analytics)\\.|s.w.org|wikipedia|myspace|yahoo|mail|pinterest|reddit|tumblr|www.ccdc.cam.ac.uk|figshare.com/collections/|datadryad.org/stash/dataset/|evernote|skype|microsoft|adobe|buffer|digg|stumbleupon|addthis|delicious|dailymotion|gostats|blog(?:ger)?|copyright|friendfeed|newsvine|telegram|getpocket|flipboard|line.me|ok.rudouban|qzone|renren|weibo|doubleclick|bit.ly|github|reviewofbooks|plu.mx|(?<!files.)wordpress|orcid.org|auth(?:orize|entication)?\\.|(?<!manuscript.)elsevier.com|sciencedirect.com|(?:static|multimedia|tienda).elsevier.|arvojournals.org|books.openedition.org|perfdrive.|services.bepress.com|(?:careers|shop).|myworkdayjobs.com|editorialmanager.com)[^/]*/.*");
    public static final Pattern PLAIN_DOMAIN_FILTER = Pattern.compile("[^/]+://[\\w.:-]+(?:/[\\w]{2})?(?:/index.(?:[\\w]?ht(?:[\\w]{1,2})?|php[\\d]?))?[/]?(?:\\?(?:locale(?:-attribute)?|ln)=[\\w_-]+)?$");
    public static AtomicInteger javascriptPageUrls = new AtomicInteger(0);
    public static AtomicInteger crawlerSensitiveDomains = new AtomicInteger(0);
    public static AtomicInteger doajResultPageUrls = new AtomicInteger(0);
    public static AtomicInteger pagesWithHtmlDocUrls = new AtomicInteger(0);
    public static AtomicInteger pagesRequireLoginToAccessDocFiles = new AtomicInteger(0);
    public static AtomicInteger pagesWithLargerCrawlingDepth = new AtomicInteger(0);
    public static AtomicInteger longToRespondUrls = new AtomicInteger(0);
    public static AtomicInteger urlsWithUnwantedForm = new AtomicInteger(0);
    public static AtomicInteger pangaeaUrls = new AtomicInteger(0);
    public static AtomicInteger pagesNotProvidingDocUrls = new AtomicInteger(0);

    public static boolean matchesUnwantedUrlType(String str, String str2, String str3) {
        if (str3.contains("tandfonline.com") || str3.contains("persee.fr") || str3.contains("papers.ssrn.com") || str3.contains("documentation.ird.fr") || str3.contains("library.unisa.edu.au") || str3.contains("publications.cnr.it")) {
            logger.debug("Url-\"" + str2 + "\": Discarded after matching to a JavaScript-using domain, other than the 'sciencedirect.com'.");
            UrlUtils.logOutputData(str, str2, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded after matching to a JavaScript-using domain, other than the 'sciencedirect.com'.", null, true, "true", "N/A", "false", "false", "false", null, "null");
            if (LoaderAndChecker.useIdUrlPairs) {
                return true;
            }
            javascriptPageUrls.incrementAndGet();
            return true;
        }
        if (str3.contains("doaj.org/toc/")) {
            logger.debug("Url-\"" + str2 + "\": Discarded after matching to the Results-directory: 'doaj.org/toc/'.");
            UrlUtils.logOutputData(str, str2, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded after matching to the Results-directory: 'doaj.org/toc/'.", null, true, "true", "N/A", "false", "false", "false", null, "null");
            if (LoaderAndChecker.useIdUrlPairs) {
                return true;
            }
            doajResultPageUrls.incrementAndGet();
            return true;
        }
        if (str3.contains("dlib.org") || str3.contains("saberes.fcecon.unr.edu.ar") || str3.contains("eumed.net")) {
            logger.debug("Url-\"" + str2 + "\": Discarded after matching to a site containing the full-text as plain-text inside its HTML.");
            UrlUtils.logOutputData(str, str2, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded after matching to a site containing the full-text as plain-text inside its HTML.", null, true, "true", "N/A", "false", "false", "false", null, "null");
            if (LoaderAndChecker.useIdUrlPairs) {
                return true;
            }
            pagesWithHtmlDocUrls.incrementAndGet();
            return true;
        }
        if (str3.contains("rivisteweb.it") || str3.contains("wur.nl") || str3.contains("remeri.org.mx") || str3.contains("cam.ac.uk") || str3.contains("scindeks.ceon.rs") || str3.contains("egms.de")) {
            logger.debug("Url-\"" + str2 + "\": Discarded after matching to a domain which doesn't provide docUrls.");
            UrlUtils.logOutputData(str, str2, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded after matching to a domain which doesn't provide docUrls.", null, true, "true", "N/A", "false", "false", "false", null, "null");
            if (LoaderAndChecker.useIdUrlPairs) {
                return true;
            }
            pagesNotProvidingDocUrls.incrementAndGet();
            return true;
        }
        if (str3.contains("bibliotecadigital.uel.br") || str3.contains("cepr.org")) {
            logger.debug("Url-\"" + str2 + "\": Discarded after matching to a domain which needs login to access docFiles.");
            UrlUtils.logOutputData(str, str2, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded after matching to a domain which needs login to access docFiles.", null, true, "true", "N/A", "false", "false", "false", null, "null");
            if (LoaderAndChecker.useIdUrlPairs) {
                return true;
            }
            pagesRequireLoginToAccessDocFiles.incrementAndGet();
            return true;
        }
        if ((str3.contains("/view/") && !str3.contains(".pdf")) || ((str3.contains("scielosp.org") && !str3.contains("/pdf/")) || str3.contains("dk.um.si") || str3.contains("apospublications.com") || str3.contains("jorr.org") || str3.contains("rwth-aachen.de") || str3.contains("pubmed.ncbi.nlm.nih.gov"))) {
            logger.debug("Url-\"" + str2 + "\": Discarded after matching to a site having its DocUrls in larger depth.");
            UrlUtils.logOutputData(str, str2, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded after matching to a site having its DocUrls in larger depth.", null, true, "true", "N/A", "false", "false", "false", null, "null");
            if (LoaderAndChecker.useIdUrlPairs) {
                return true;
            }
            pagesWithLargerCrawlingDepth.incrementAndGet();
            return true;
        }
        if (str3.contains("doi.org/https://doi.org/") && str3.contains("pangaea.")) {
            logger.debug("Url-\"" + str2 + "\": Discarded after matching to a 'PANGAEA.' url with invalid form and non-docUrls in their internal links.");
            UrlUtils.logOutputData(str, str2, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded after matching to a 'PANGAEA.' url with invalid form and non-docUrls in their internal links.", null, true, "true", "N/A", "false", "false", "false", null, "null");
            if (LoaderAndChecker.useIdUrlPairs) {
                return true;
            }
            pangaeaUrls.incrementAndGet();
            return true;
        }
        if (!LoaderAndChecker.retrieveDatasets && str3.contains("pangaea.")) {
            logger.debug("Url-\"" + str2 + "\": Discarded after matching to a 'PANGAEA.' url which gives only datasets, not full-texts.");
            UrlUtils.logOutputData(str, str2, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded after matching to a 'PANGAEA.' url which gives only datasets, not full-texts.", null, true, "true", "N/A", "false", "false", "false", null, "null");
            if (LoaderAndChecker.useIdUrlPairs) {
                return true;
            }
            pangaeaUrls.incrementAndGet();
            return true;
        }
        if (str3.contains("200.17.137.108")) {
            logger.debug("Url-\"" + str2 + "\": Discarded after matching to known urls with connectivity problems.");
            UrlUtils.logOutputData(str, str2, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded after matching to known urls with connectivity problems.", null, true, "true", "N/A", "false", "false", "false", null, "null");
            if (LoaderAndChecker.useIdUrlPairs) {
                return true;
            }
            LoaderAndChecker.connProblematicUrls.incrementAndGet();
            return true;
        }
        if (str3.contains("sharedsitesession")) {
            ConnSupportUtils.blockSharedSiteSessionDomains(str2, null);
            logger.debug("Url-\"" + str2 + "\": It was discarded after participating in a 'sharedSiteSession-endlessRedirectionPack'.");
            UrlUtils.logOutputData(str, str2, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "It was discarded after participating in a 'sharedSiteSession-endlessRedirectionPack'.", null, true, "true", "N/A", "false", "false", "false", null, "null");
            if (LoaderAndChecker.useIdUrlPairs) {
                return true;
            }
            LoaderAndChecker.connProblematicUrls.incrementAndGet();
            return true;
        }
        if (!shouldNotAcceptPageUrl(str2, str3)) {
            return false;
        }
        logger.debug("Url-\"" + str2 + "\": Discarded after matching to unwantedType-regex-rules.");
        UrlUtils.logOutputData(str, str2, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded after matching to unwantedType-regex-rules.", null, true, "true", "N/A", "false", "false", LoaderAndChecker.COULD_RETRY_URLS.matcher(str2).matches() ? "true" : "false", null, "null");
        if (LoaderAndChecker.useIdUrlPairs) {
            return true;
        }
        urlsWithUnwantedForm.incrementAndGet();
        return true;
    }

    public static boolean shouldNotAcceptPageUrl(String str, String str2) {
        if (str2 == null) {
            str2 = str.toLowerCase();
        }
        return URL_DIRECTORY_FILTER.matcher(str2).matches() || SPECIFIC_DOMAIN_FILTER.matcher(str2).matches() || PLAIN_DOMAIN_FILTER.matcher(str2).matches() || URL_FILE_EXTENSION_FILTER.matcher(str2).matches() || PageCrawler.NON_VALID_DOCUMENT.matcher(str2).matches() || CURRENTLY_UNSUPPORTED_DOC_EXTENSION_FILTER.matcher(str2).matches();
    }

    public static boolean shouldNotAcceptInternalLink(String str, String str2) {
        if (str2 == null) {
            str2 = str.toLowerCase();
        }
        return shouldNotAcceptPageUrl(str, str2) || INTERNAL_LINKS_KEYWORDS_FILTER.matcher(str2).matches() || INTERNAL_LINKS_FILE_FORMAT_FILTER.matcher(str2).matches() || PLAIN_PAGE_EXTENSION_FILTER.matcher(str2).matches();
    }
}
