package eu.dnetlib.pace.common;

import com.ibm.icu.text.Transliterator;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import repackaged.com.google.common.google.common.base.Joiner;
import repackaged.com.google.common.google.common.collect.Sets;

/* loaded from: input_file:eu/dnetlib/pace/common/AbstractPaceFunctions.class */
public class AbstractPaceFunctions extends PaceCommonUtils {
    // Characters kept by removeSymbols(); every other character is turned into a space.
    private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
    // Normalized alias -> first CSV column, built by loadMapFromClasspath() (empty if loading fails).
    private static Map<String, String> cityMap = loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
    // Same layout and loader as cityMap, for the keyword translation table.
    private static Map<String, String> keywordMap = loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
    // Keyed by the cityMap value of each alias. The loader reads cityMap, so this field
    // must stay declared AFTER cityMap (static initializers run in textual order).
    private static Map<String, String> countryMap = loadCountryMapFromClasspath("/eu/dnetlib/pace/config/country_map.csv");
    // Per-language stop-word sets. loadFromClasspath is not defined in this file
    // (presumably inherited from PaceCommonUtils - confirm there).
    protected static Set<String> stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt");
    protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
    protected static Set<String> stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
    protected static Set<String> stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
    protected static Set<String> stopwords_fr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt");
    protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
    protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
    // Loaded the same way as the stop-word sets; not referenced elsewhere in this file.
    protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
    // Matches anything between '<' and the next '>' (used by cleanup() to drop markup).
    public static final Pattern HTML_REGEX = Pattern.compile("<[^>]*>");
    // Matches an "http(s)://dx.doi.org/" or "doi:" prefix (used by normalizePid()).
    public static final Pattern DOI_PREFIX = Pattern.compile("(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)");
    // Optional minus sign, digits, optional decimal fraction.
    private static Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
    // A literal backslash-u followed by exactly four hex digits; group 1 is the hex value.
    private static Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
    // Upper-case Roman numerals. Every part is optional, so the empty string also matches.
    private static Pattern romanNumberPattern = Pattern.compile("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$");

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Joins the non-null elements of {@code list} with a single space.
     *
     * <p>Uses the JDK stream API rather than the relocated Guava {@code Joiner}; the
     * result is the same: {@code null} elements are skipped, and an empty list yields
     * an empty string.
     *
     * @param list tokens to join; may contain {@code null} elements
     * @return the space-separated concatenation of the non-null elements
     * @throws NullPointerException if {@code list} itself is {@code null}
     */
    public static String concat(List<String> list) {
        return list.stream().filter(Objects::nonNull).collect(Collectors.joining(" "));
    }

    /**
     * Reduces free text to a lower-case, ASCII-only, single-spaced form.
     *
     * <p>Steps, in order: strip {@link #HTML_REGEX} matches, lower-case, decode
     * backslash-u escapes, apply {@code nfd}, replace a few XML entities, pad digit runs
     * with spaces, apply {@code transliterate} and {@code fixAliases}, drop non-ASCII
     * characters, turn punctuation and newlines into spaces, collapse whitespace and trim.
     * {@code nfd}, {@code transliterate} and {@code fixAliases} are defined outside this file.
     *
     * @param str raw text; must not be {@code null}
     * @return the cleaned text
     */
    public static String cleanup(String str) {
        String text = HTML_REGEX.matcher(str).replaceAll("").toLowerCase();
        text = fixXML(nfd(unicodeNormalization(text)));
        // Isolate every run of digits so numbers become standalone tokens.
        text = text.replaceAll("([0-9]+)", " $1 ");
        text = fixAliases(transliterate(text));
        text = text.replaceAll("[^\\p{ASCII}]", "");
        text = text.replaceAll("[\\p{Punct}]", " ");
        text = text.replaceAll("\\n", " ");
        text = text.replaceAll("(?m)\\s+", " ");
        return text.trim();
    }

    /**
     * Returns {@code str} unless it equals "unknown" (ignoring case); in that case it
     * tries to derive a country from the cities found in {@code str2}.
     *
     * @param str  the already known country value; must not be {@code null}
     * @param str2 free text searched for city names (n-grams of up to 4 tokens)
     * @return {@code str}, or the first non-null country found for a detected city, or
     *         {@code ModelConstants.UNKNOWN} when none is found
     */
    public static String countryInference(String str, String str2) {
        if (!str.equalsIgnoreCase("unknown")) {
            return str;
        }
        String cleaned = filterAllStopWords(normalize(cleanup(str2)));
        Set<String> countries = citiesToCountry(getCities(cleaned, 4));
        return countries.stream().filter(Objects::nonNull).findFirst().orElse(ModelConstants.UNKNOWN);
    }

    /**
     * Cleans {@code str} and replaces every city name found in it with the value that
     * {@code cityMap} holds for that name.
     *
     * <p>The substitution is literal ({@link String#replace}); neither the matched name
     * nor the mapped value is interpreted as a regular expression or as a replacement
     * template, so characters such as {@code $} or {@code \} in the map are safe.
     *
     * @param str raw text; must not be {@code null}
     * @return the cleaned text with city names substituted
     */
    public static String cityInference(String str) {
        String result = filterAllStopWords(normalize(cleanup(str)));
        for (String city : getCities(result, 4)) {
            result = result.replace(city, cityMap.get(city));
        }
        return result;
    }

    /**
     * Cleans {@code str} and replaces every keyword found in it with the value that
     * {@code keywordMap} holds for that keyword.
     *
     * <p>The substitution is literal ({@link String#replace}); neither the matched
     * keyword nor the mapped value is interpreted as a regular expression or as a
     * replacement template.
     *
     * @param str raw text; must not be {@code null}
     * @return the cleaned text with keywords substituted
     */
    public static String keywordInference(String str) {
        String result = filterAllStopWords(normalize(cleanup(str)));
        for (String keyword : getKeywords(result, keywordMap, 4)) {
            result = result.replace(keyword, keywordMap.get(keyword));
        }
        return result;
    }

    /**
     * Cleans {@code str}, then substitutes first the keywords and then the city names
     * found in it with their values from {@code keywordMap} and {@code cityMap}.
     *
     * <p>Both sets are detected on the cleaned text before any substitution happens.
     * Substitutions are literal ({@link String#replace}): nothing from the maps is
     * interpreted as a regular expression or replacement template.
     *
     * @param str raw text; must not be {@code null}
     * @return the cleaned text with keywords and city names substituted
     */
    public static String cityKeywordInference(String str) {
        String result = filterAllStopWords(normalize(cleanup(str)));
        // Detect both sets up front, against the same (unsubstituted) text.
        Set<String> keywords = getKeywords(result, keywordMap, 4);
        Set<String> cities = getCities(result, 4);
        for (String keyword : keywords) {
            result = result.replace(keyword, keywordMap.get(keyword));
        }
        for (String city : cities) {
            result = result.replace(city, cityMap.get(city));
        }
        return result;
    }

    /**
     * Replaces the entities {@code &ndash;}, {@code &amp;}, {@code &quot;} and
     * {@code &minus;} with a single space each.
     *
     * @param str text to process; must not be {@code null}
     * @return the text with those four entities replaced by spaces
     */
    protected static String fixXML(String str) {
        String cleaned = str;
        for (String entity : new String[] {"&ndash;", "&amp;", "&quot;", "&minus;"}) {
            cleaned = cleaned.replace(entity, " ");
        }
        return cleaned;
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Tells whether two strings disagree on their numeric content.
     *
     * @param str  first text; must not be {@code null}
     * @param str2 second text; must not be {@code null}
     * @return {@code true} when {@link #getNumbers} or {@link #getRomans} yields a
     *         different result for the two inputs, {@code false} when both agree
     */
    public static boolean checkNumbers(String str, String str2) {
        return !getNumbers(str).equals(getNumbers(str2))
                || !getRomans(str).equals(getRomans(str2));
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Concatenates, without separator, every space-delimited token of {@code str}
     * that {@link #isRoman} accepts.
     *
     * @param str text to scan; must not be {@code null}
     * @return the accepted tokens glued together in their original order
     */
    public static String getRomans(String str) {
        return Arrays.stream(str.split(" "))
                .filter(token -> isRoman(token))
                .collect(Collectors.joining());
    }

    /**
     * Checks {@code str} against {@code romanNumberPattern}.
     *
     * @param str candidate token; must not be {@code null}
     * @return {@code true} when the whole token matches and the matcher reports that it
     *         hit the end of the input
     */
    protected static boolean isRoman(String str) {
        final Matcher numeral = romanNumberPattern.matcher(str);
        if (!numeral.matches()) {
            return false;
        }
        return numeral.hitEnd();
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Concatenates, without separator, every space-delimited token of {@code str}
     * that {@link #isNumber} accepts.
     *
     * @param str text to scan; must not be {@code null}
     * @return the numeric tokens glued together in their original order
     */
    public static String getNumbers(String str) {
        return Arrays.stream(str.split(" "))
                .filter(token -> isNumber(token))
                .collect(Collectors.joining());
    }

    /**
     * Tells whether {@code str} is entirely a number as defined by {@code numberPattern}.
     *
     * @param str candidate token; may be {@code null}
     * @return {@code false} for {@code null}, otherwise whether the whole string matches
     */
    public static boolean isNumber(String str) {
        return str != null && numberPattern.matcher(str).matches();
    }

    /**
     * Replaces every character that is not listed in {@code alpha} with a space, then
     * collapses each run of whitespace into a single space.
     *
     * @param str text to process; must not be {@code null}
     * @return the filtered text (leading/trailing spaces are not trimmed)
     */
    protected static String removeSymbols(String str) {
        final char[] chars = str.toCharArray();
        for (int idx = 0; idx < chars.length; idx++) {
            if (alpha.indexOf(chars[idx]) < 0) {
                chars[idx] = ' ';
            }
        }
        return new String(chars).replaceAll("\\s+", " ");
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Null check helper.
     *
     * @param str value to test
     * @return {@code true} when {@code str} is not {@code null}
     */
    public static boolean notNull(String str) {
        return Objects.nonNull(str);
    }

    /**
     * Encodes {@code str} to UTF-8 bytes and decodes those bytes back into a string.
     *
     * @param str text to round-trip; must not be {@code null}
     * @return the string rebuilt from its UTF-8 encoding
     */
    public static String utf8(String str) {
        final byte[] encoded = str.getBytes(StandardCharsets.UTF_8);
        return new String(encoded, StandardCharsets.UTF_8);
    }

    /**
     * Decodes every textual escape matched by {@code hexUnicodePattern} (a backslash,
     * {@code u} and four hex digits) into the character with that code.
     *
     * @param str text possibly containing such escapes; must not be {@code null}
     * @return the text with each escape replaced by the character it denotes
     */
    public static String unicodeNormalization(String str) {
        final Matcher escapes = hexUnicodePattern.matcher(str);
        final StringBuffer decoded = new StringBuffer(str.length());
        while (escapes.find()) {
            final char codeUnit = (char) Integer.parseInt(escapes.group(1), 16);
            // Quote so that a decoded '$' or '\' is inserted verbatim.
            final String literal = Matcher.quoteReplacement(String.valueOf(codeUnit));
            escapes.appendReplacement(decoded, literal);
        }
        escapes.appendTail(decoded);
        return decoded.toString();
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Drops every whitespace-delimited token of {@code str} that is contained in
     * {@code set} and joins the survivors with single spaces.
     *
     * @param str text to filter; must not be {@code null}
     * @param set tokens to remove (compared as-is, case-sensitively)
     * @return the remaining tokens separated by one space, trimmed
     */
    public static String filterStopWords(String str, Set<String> set) {
        final StringBuilder kept = new StringBuilder();
        final StringTokenizer tokens = new StringTokenizer(str);
        while (tokens.hasMoreTokens()) {
            final String word = tokens.nextToken();
            if (set.contains(word)) {
                continue;
            }
            if (kept.length() > 0) {
                kept.append(' ');
            }
            kept.append(word);
        }
        return kept.toString().trim();
    }

    /**
     * Applies {@link #filterStopWords} with each stop-word set in turn, in the order
     * en, de, it, fr, pt, es, gr.
     *
     * @param str text to filter; must not be {@code null}
     * @return the text with the stop words of all seven sets removed
     */
    public static String filterAllStopWords(String str) {
        String remaining = str;
        for (Set<String> stopwords : Arrays.asList(
                stopwords_en, stopwords_de, stopwords_it, stopwords_fr, stopwords_pt, stopwords_es, stopwords_gr)) {
            remaining = filterStopWords(remaining, stopwords);
        }
        return remaining;
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Returns the elements of {@code collection} that are not in {@code set},
     * de-duplicated and in first-encounter order.
     *
     * <p>The result is a properly typed JDK {@link LinkedHashSet}; the original code
     * used a raw set obtained from a relocated Guava factory.
     *
     * @param collection candidates to filter; must not be {@code null}
     * @param set        blacklist; must not be {@code null}
     * @return a new insertion-ordered collection holding the non-blacklisted elements
     */
    public static Collection<String> filterBlacklisted(Collection<String> collection, Set<String> set) {
        final Set<String> allowed = new LinkedHashSet<>();
        for (String candidate : collection) {
            if (!set.contains(candidate)) {
                allowed.add(candidate);
            }
        }
        return allowed;
    }

    /**
     * Loads a semicolon-separated resource whose lines look like
     * {@code value;alias1;alias2;...} into an alias-to-value map.
     *
     * <p>Each alias is lower-cased, transliterated with the ICU "Any-Eng" transliterator
     * and passed through {@code fixAliases} before being used as key; the first column
     * is stored as the value for every alias on the line.
     *
     * <p>Loading is best-effort: if the resource is missing or cannot be read or parsed,
     * an empty map is returned and anything read so far is discarded. JVM
     * {@link Error}s are no longer swallowed.
     *
     * @param str classpath location of the resource, resolved against this class
     * @return the alias map, or an empty map when loading fails
     */
    public static Map<String, String> loadMapFromClasspath(String str) {
        Transliterator transliterator = Transliterator.getInstance("Any-Eng");
        Map<String, String> aliasToValue = new HashMap<>();
        // try-with-resources: IOUtils.readLines does not close the stream it is given.
        try (InputStream in = AbstractPaceFunctions.class.getResourceAsStream(str)) {
            for (String line : IOUtils.readLines(in, StandardCharsets.UTF_8)) {
                String[] fields = line.split(";");
                String value = fields[0];
                for (int i = 1; i < fields.length; i++) {
                    aliasToValue.put(fixAliases(transliterator.transliterate(fields[i].toLowerCase())), value);
                }
            }
            return aliasToValue;
        } catch (IOException | RuntimeException ignored) {
            // Deliberate best-effort: a broken or missing resource yields an empty map.
            return new HashMap<>();
        }
    }

    /**
     * Loads a semicolon-separated resource whose lines look like
     * {@code country;city1;city2;...} into a map from city value to country.
     *
     * <p>Each city alias is lower-cased, transliterated with the ICU "Any-Eng"
     * transliterator, passed through {@code fixAliases} and then looked up in
     * {@code cityMap}; the value found there becomes the key. Aliases that
     * {@code cityMap} does not know are skipped, where previously they were all stored
     * under a single {@code null} key. {@code cityMap} must already be initialized
     * when this method runs.
     *
     * <p>Loading is best-effort: if the resource is missing or cannot be read or parsed,
     * an empty map is returned. JVM {@link Error}s are no longer swallowed.
     *
     * @param str classpath location of the resource, resolved against this class
     * @return the city-to-country map, or an empty map when loading fails
     */
    public static Map<String, String> loadCountryMapFromClasspath(String str) {
        Transliterator transliterator = Transliterator.getInstance("Any-Eng");
        Map<String, String> cityToCountry = new HashMap<>();
        // try-with-resources: IOUtils.readLines does not close the stream it is given.
        try (InputStream in = AbstractPaceFunctions.class.getResourceAsStream(str)) {
            for (String line : IOUtils.readLines(in, StandardCharsets.UTF_8)) {
                String[] fields = line.split(";");
                String country = fields[0];
                for (int i = 1; i < fields.length; i++) {
                    String city = cityMap.get(fixAliases(transliterator.transliterate(fields[i].toLowerCase())));
                    if (city != null) {
                        cityToCountry.put(city, country);
                    }
                }
            }
            return cityToCountry;
        } catch (IOException | RuntimeException ignored) {
            // Deliberate best-effort: a broken or missing resource yields an empty map.
            return new HashMap<>();
        }
    }

    /**
     * Removes every occurrence of each keyword (lower-cased) from {@code str}.
     *
     * <p>Keywords are matched literally. The earlier implementation passed them to
     * {@code String.replaceAll}, so a keyword such as {@code "c++"} was interpreted as
     * a regular expression and {@code "("} raised {@code PatternSyntaxException}.
     * The input text itself is not lower-cased.
     *
     * @param str text to process
     * @param set keywords to delete; must not be {@code null} nor contain {@code null}
     * @return the text without the keywords, trimmed
     */
    public static String removeKeywords(String str, Set<String> set) {
        String padded = " " + str + " ";
        for (String keyword : set) {
            padded = padded.replace(keyword.toLowerCase(), "");
        }
        return padded.trim();
    }

    /**
     * Computes the share of common elements relative to the larger of the two sets.
     *
     * <p>The decompiled lambda referenced an undefined variable and did not compile;
     * it is restored here as the method reference {@code set2::contains}.
     *
     * @param set  first set; must not be {@code null}
     * @param set2 second set; must not be {@code null}
     * @return the number of elements of {@code set} also contained in {@code set2},
     *         divided by the size of the larger set; {@code NaN} when both are empty
     */
    public static double commonElementsPercentage(Set<String> set, Set<String> set2) {
        double larger = Math.max(set.size(), set2.size());
        long shared = set.stream().filter(set2::contains).count();
        return shared / larger;
    }

    /**
     * Maps every element of {@code set} through {@code map}.
     *
     * @param set keys to translate; must not be {@code null}
     * @param map lookup table; must not be {@code null}
     * @return the set of looked-up values; it contains {@code null} when at least one
     *         key has no mapping
     */
    public static Set<String> toCodes(Set<String> set, Map<String, String> map) {
        final Set<String> codes = new HashSet<>();
        for (String key : set) {
            codes.add(map.get(key));
        }
        return codes;
    }

    /**
     * Translates keywords into their codes; a thin alias for {@link #toCodes}.
     *
     * @param set keywords to translate
     * @param map keyword-to-code table
     * @return the result of {@code toCodes(set, map)}
     */
    public static Set<String> keywordsToCodes(Set<String> set, Map<String, String> map) {
        final Set<String> codes = toCodes(set, map);
        return codes;
    }

    /**
     * Translates city names through {@code cityMap}.
     *
     * @param set city names to translate
     * @return the result of {@code toCodes(set, cityMap)}
     */
    public static Set<String> citiesToCodes(Set<String> set) {
        final Set<String> cityCodes = toCodes(set, cityMap);
        return cityCodes;
    }

    /**
     * Translates city names to countries in two lookups: first through
     * {@code cityMap}, then the results through {@code countryMap}.
     *
     * @param set city names to translate
     * @return the looked-up countries; may contain {@code null} for unmapped entries
     */
    public static Set<String> citiesToCountry(Set<String> set) {
        final Set<String> cityCodes = toCodes(set, cityMap);
        return toCodes(cityCodes, countryMap);
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Returns the first character of {@code str}, lower-cased.
     *
     * @param str source text
     * @return the lower-cased substring covering index 0 to 1 as produced by
     *         {@code StringUtils.substring}
     */
    public static String firstLC(String str) {
        final String head = StringUtils.substring(str, 0, 1);
        return head.toLowerCase();
    }

    /**
     * Lower-cases a persistent identifier and removes every {@link #DOI_PREFIX} match
     * from it.
     *
     * @param str identifier to normalize; must not be {@code null}
     * @return the lower-cased identifier without DOI prefixes
     */
    public static String normalizePid(String str) {
        final String lowered = str.toLowerCase();
        return DOI_PREFIX.matcher(lowered).replaceAll("");
    }

    /**
     * Finds the keys of {@code map} that occur in {@code str} as runs of consecutive
     * space-separated tokens, trying longer runs first.
     *
     * <p>The window starts at {@code min(i, tokenCount)} tokens and shrinks down to 1.
     * Each time a run is found it is recorded and removed from the working text; the
     * token list is rebuilt from that text only after a window size has been fully
     * scanned, so shorter windows no longer see tokens consumed by longer matches.
     * Only the initial token list is lower-cased; the working text is not.
     *
     * <p>A window of zero or less now returns an empty set. The previous loop guard
     * ({@code != 0}) made a negative window fail with {@code IllegalArgumentException}.
     *
     * @param str text to scan; must not be {@code null}
     * @param map table whose keys are the searched keywords
     * @param i   maximum number of tokens per candidate
     * @return the keys found, possibly empty
     */
    public static Set<String> getKeywords(String str, Map<String, String> map, int i) {
        String remaining = str;
        List<String> tokens = Arrays.asList(remaining.toLowerCase().split(" "));
        Set<String> found = new HashSet<>();
        int window = Math.min(i, tokens.size());
        for (int size = window; size > 0; size--) {
            for (int start = 0; start <= tokens.size() - size; start++) {
                // Tokens produced by split() are never null, so a plain join is enough.
                String candidate = String.join(" ", tokens.subList(start, start + size));
                if (map.containsKey(candidate)) {
                    found.add(candidate);
                    remaining = remaining.replace(candidate, "").trim();
                }
            }
            // Re-tokenize once per window size, after the scan at that size is complete.
            tokens = Arrays.asList(remaining.split(" "));
        }
        return found;
    }

    /**
     * Finds the city names of {@code cityMap} occurring in {@code str}.
     *
     * @param str text to scan
     * @param i   maximum number of tokens per candidate
     * @return the result of {@code getKeywords(str, cityMap, i)}
     */
    public static Set<String> getCities(String str, int i) {
        final Set<String> cities = getKeywords(str, cityMap, i);
        return cities;
    }

    /**
     * Reads a classpath resource fully into a string, decoding it as UTF-8.
     *
     * <p>The resource stream is now closed after reading, and the underlying
     * {@link IOException} is kept as the cause of the thrown exception.
     *
     * @param str resource location, resolved with {@code cls.getResourceAsStream}
     * @param cls class used to resolve the resource
     * @param <T> type of {@code cls}
     * @return the resource content
     * @throws RuntimeException if reading the resource fails with an I/O error
     */
    public static <T> String readFromClasspath(String str, Class<T> cls) {
        StringWriter content = new StringWriter();
        try (InputStream in = cls.getResourceAsStream(str)) {
            IOUtils.copy(in, content, StandardCharsets.UTF_8);
            return content.toString();
        } catch (IOException e) {
            throw new RuntimeException("cannot load resource from classpath: " + str, e);
        }
    }
}
