package eu.dnetlib.pace.common;

import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.collect.UnmodifiableIterator;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.FieldListImpl;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;

/* loaded from: input_file:eu/dnetlib/pace/common/AbstractPaceFunctions.class */
/**
 * Base class collecting the string helpers shared by its subclasses: text clean-up and
 * normalization, stop-word and keyword filtering, number / roman-numeral extraction, and
 * loading of word lists and code maps from the classpath.
 *
 * <p>All case folding uses {@link Locale#ROOT} so results do not depend on the JVM default locale.
 */
public abstract class AbstractPaceFunctions {
    /** Characters kept by {@link #removeSymbols(String)}; anything else becomes a blank. */
    private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
    /** Characters rewritten by {@link #fixAliases(String)}; position i maps to position i of {@code aliases_to}. */
    private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
    private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
    /** Regular expression matching the DOI prefixes stripped by {@link #normalizePid(String)}. */
    public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
    /** Optional sign, digits, optional decimal part. */
    private static final Pattern NUMBER_PATTERN = Pattern.compile("-?\\d+(\\.\\d+)?");
    /** A literal backslash-u escape followed by four hex digits. */
    private static final Pattern HEX_UNICODE_PATTERN = Pattern.compile("\\\\u(\\p{XDigit}{4})");
    /** Upper-case roman numerals; every group is optional, so the empty string matches too. */
    private static final Pattern ROMAN_PATTERN =
            Pattern.compile("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$");
    private static final Map<String, String> cityMap = loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
    protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
    protected static Set<String> stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
    protected static Set<String> stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
    protected static Set<String> stopwords_fr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt");
    protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
    protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
    protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
    protected static final FieldList EMPTY_FIELD = new FieldListImpl();

    /**
     * Joins the given strings with a single blank, skipping {@code null} elements.
     *
     * @param list the parts to join
     * @return the joined string
     */
    public String concat(List<String> list) {
        return Joiner.on(" ").skipNulls().join(list);
    }

    /**
     * Lower-cases the input, decodes backslash-u escapes, replaces aliased characters, applies NFD
     * and then reduces the text to ASCII words and numbers separated by single blanks.
     *
     * @param str the text to clean; must not be {@code null}
     * @return the cleaned, trimmed text
     */
    public String cleanup(String str) {
        String decomposed = nfd(fixAliases(unicodeNormalization(str.toLowerCase(Locale.ROOT))));
        return decomposed
                .replaceAll("&ndash;", " ")
                .replaceAll("&amp;", " ")
                .replaceAll("&quot;", " ")
                .replaceAll("&minus;", " ")
                // isolate every digit run so that it becomes a token of its own
                .replaceAll("([0-9]+)", " $1 ")
                .replaceAll("[^\\p{ASCII}]", "")
                .replaceAll("[\\p{Punct}]", " ")
                .replaceAll("\\n", " ")
                .replaceAll("(?m)\\s+", " ")
                .trim();
    }

    /**
     * Tells whether the two strings disagree on the numbers or roman numerals they contain.
     *
     * @param str  first string
     * @param str2 second string
     * @return {@code true} when {@link #getNumbers(String)} or {@link #getRomans(String)} differ
     *         between the two inputs, {@code false} when both agree
     */
    public boolean checkNumbers(String str, String str2) {
        boolean sameNumbers = getNumbers(str).equals(getNumbers(str2));
        boolean sameRomans = getRomans(str).equals(getRomans(str2));
        return !(sameNumbers && sameRomans);
    }

    /**
     * Concatenates, without separator, the blank-separated tokens accepted by {@link #isRoman(String)}.
     *
     * @param str the text to scan
     * @return the concatenated roman-numeral tokens, possibly empty
     */
    public String getRomans(String str) {
        StringBuilder romans = new StringBuilder();
        for (String token : str.split(" ")) {
            if (isRoman(token)) {
                romans.append(token);
            }
        }
        return romans.toString();
    }

    /**
     * Tells whether the whole token is an upper-case roman numeral.
     *
     * <p>The empty string is accepted because every part of the pattern is optional.
     *
     * @param str the token to test; must not be {@code null}
     * @return {@code true} if the token matches the roman-numeral pattern
     */
    protected boolean isRoman(String str) {
        return ROMAN_PATTERN.matcher(str).matches();
    }

    /**
     * Concatenates, without separator, the blank-separated tokens accepted by {@link #isNumber(String)}.
     *
     * @param str the text to scan
     * @return the concatenated numeric tokens, possibly empty
     */
    public String getNumbers(String str) {
        StringBuilder numbers = new StringBuilder();
        for (String token : str.split(" ")) {
            if (isNumber(token)) {
                numbers.append(token);
            }
        }
        return numbers.toString();
    }

    /**
     * Tells whether the string is an optionally signed integer or decimal number.
     *
     * @param str the candidate, may be {@code null}
     * @return {@code false} for {@code null}, otherwise whether the whole string is a number
     */
    public boolean isNumber(String str) {
        return str != null && NUMBER_PATTERN.matcher(str).matches();
    }

    /**
     * Replaces every character listed in {@code aliases_from} with the character at the same
     * position in {@code aliases_to}; other characters are copied unchanged.
     *
     * @param str the text to rewrite
     * @return the rewritten text, same length as the input
     */
    protected static String fixAliases(String str) {
        StringBuilder fixed = new StringBuilder(str.length());
        for (int i = 0; i < str.length(); i++) {
            char current = str.charAt(i);
            int aliasIndex = aliases_from.indexOf(current);
            fixed.append(aliasIndex >= 0 ? aliases_to.charAt(aliasIndex) : current);
        }
        return fixed.toString();
    }

    /**
     * Replaces every character that is not an ASCII letter, ASCII digit or blank with a blank,
     * then collapses whitespace runs into a single blank.
     *
     * @param str the text to filter
     * @return the filtered text (not trimmed)
     */
    protected String removeSymbols(String str) {
        StringBuilder kept = new StringBuilder(str.length());
        for (int i = 0; i < str.length(); i++) {
            char current = str.charAt(i);
            kept.append(alpha.indexOf(current) >= 0 ? current : ' ');
        }
        return kept.toString().replaceAll("\\s+", " ");
    }

    /**
     * Returns the string value of the first element of the field.
     *
     * @param field the field to read, may be {@code null}
     * @return the first value, or the empty string when the field is {@code null} or empty
     */
    public String getFirstValue(Field field) {
        if (field == null || Iterables.isEmpty(field)) {
            return "";
        }
        return ((Field) Iterables.getFirst(field, EMPTY_FIELD)).stringValue();
    }

    /**
     * @param str the value to test
     * @return {@code true} if the value is not {@code null}
     */
    public boolean notNull(String str) {
        return str != null;
    }

    /**
     * Decodes backslash-u escapes, applies NFD, lower-cases and then strips everything that is not
     * a blank or a word character; remaining punctuation, digit runs and newlines become blanks.
     *
     * @param str the text to normalize; must not be {@code null}
     * @return the normalized, trimmed text
     */
    public String normalize(String str) {
        return nfd(unicodeNormalization(str))
                .toLowerCase(Locale.ROOT)
                .replaceAll("[^ \\w]+", "")
                .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
                .replaceAll("(\\p{Punct})+", " ")
                .replaceAll("(\\d)+", " ")
                .replaceAll("(\\n)+", " ")
                .trim();
    }

    /**
     * @param str the text to decompose
     * @return the text in Unicode normalization form NFD
     */
    public String nfd(String str) {
        return Normalizer.normalize(str, Normalizer.Form.NFD);
    }

    /**
     * Replaces every literal backslash-u escape followed by four hex digits with the character it
     * denotes.
     *
     * @param str the text possibly containing escapes
     * @return the text with the escapes decoded
     */
    public String unicodeNormalization(String str) {
        Matcher escapes = HEX_UNICODE_PATTERN.matcher(str);
        // StringBuffer (not StringBuilder) is what Matcher.appendReplacement accepts on Java 8.
        StringBuffer decoded = new StringBuffer(str.length());
        while (escapes.find()) {
            char decodedChar = (char) Integer.parseInt(escapes.group(1), 16);
            // quoteReplacement: the decoded character may itself be '$' or '\'
            escapes.appendReplacement(decoded, Matcher.quoteReplacement(String.valueOf(decodedChar)));
        }
        escapes.appendTail(decoded);
        return decoded.toString();
    }

    /**
     * Drops the whitespace-separated tokens contained in the given set.
     *
     * @param str the text to filter
     * @param set the tokens to drop (exact match)
     * @return the surviving tokens joined by single blanks
     */
    public String filterStopWords(String str, Set<String> set) {
        StringTokenizer words = new StringTokenizer(str);
        StringBuilder kept = new StringBuilder();
        while (words.hasMoreTokens()) {
            String word = words.nextToken();
            if (!set.contains(word)) {
                kept.append(word).append(" ");
            }
        }
        return kept.toString().trim();
    }

    /**
     * Applies {@link #filterStopWords(String, Set)} with the English, German, Italian, French,
     * Portuguese and Spanish stop-word lists, in that order.
     *
     * @param str the text to filter
     * @return the text without stop words
     */
    public String filterAllStopWords(String str) {
        String filtered = filterStopWords(str, stopwords_en);
        filtered = filterStopWords(filtered, stopwords_de);
        filtered = filterStopWords(filtered, stopwords_it);
        filtered = filterStopWords(filtered, stopwords_fr);
        filtered = filterStopWords(filtered, stopwords_pt);
        return filterStopWords(filtered, stopwords_es);
    }

    /**
     * Returns the elements of the collection that are not in the blacklist, without duplicates and
     * in encounter order.
     *
     * @param collection the candidates
     * @param set        the blacklist
     * @return a new insertion-ordered set with the surviving elements
     */
    public Collection<String> filterBlacklisted(Collection<String> collection, Set<String> set) {
        Set<String> allowed = new LinkedHashSet<>();
        for (String candidate : collection) {
            if (!set.contains(candidate)) {
                allowed.add(candidate);
            }
        }
        return allowed;
    }

    /**
     * Loads the lines of a classpath resource into a set.
     *
     * <p>Loading is best-effort: a missing or unreadable resource yields an empty set.
     * The resource is decoded as UTF-8 (assumes the bundled lists are UTF-8 encoded).
     *
     * @param str the classpath location of the resource
     * @return a new mutable set with one entry per distinct line, or an empty set on failure
     */
    public static Set<String> loadFromClasspath(String str) {
        try (InputStream in = NGramUtils.class.getResourceAsStream(str)) {
            if (in == null) {
                return new HashSet<>();
            }
            return new HashSet<>(IOUtils.readLines(in, StandardCharsets.UTF_8));
        } catch (IOException | RuntimeException ignored) {
            // deliberate best-effort: these lists are loaded from static initializers
            return new HashSet<>();
        }
    }

    /**
     * Loads a semicolon-separated classpath resource into a map: on every line the first cell is
     * the value, and each following cell, lower-cased, is a key pointing to it.
     *
     * <p>Loading is best-effort: a missing, unreadable or malformed resource yields an empty map.
     * The resource is decoded as UTF-8 (assumes the bundled file is UTF-8 encoded).
     *
     * @param str the classpath location of the resource
     * @return a new mutable map, or an empty map on failure
     */
    public static Map<String, String> loadMapFromClasspath(String str) {
        try (InputStream in = AbstractPaceFunctions.class.getResourceAsStream(str)) {
            if (in == null) {
                return new HashMap<>();
            }
            Map<String, String> codes = new HashMap<>();
            for (String line : IOUtils.readLines(in, StandardCharsets.UTF_8)) {
                String[] cells = line.split(";");
                String code = cells[0];
                for (int i = 1; i < cells.length; i++) {
                    codes.put(cells[i].toLowerCase(Locale.ROOT), code);
                }
            }
            return codes;
        } catch (IOException | RuntimeException ignored) {
            // deliberate best-effort: the map is loaded from a static initializer
            return new HashMap<>();
        }
    }

    /**
     * Removes every occurrence of each lower-cased keyword from the text.
     *
     * <p>Keywords are matched literally, not as regular expressions.
     *
     * @param str the text to strip
     * @param set the keywords to remove
     * @return the stripped, trimmed text
     */
    public String removeKeywords(String str, Set<String> set) {
        String stripped = " " + str + " ";
        for (String keyword : set) {
            stripped = stripped.replace(keyword.toLowerCase(Locale.ROOT), "");
        }
        return stripped.trim();
    }

    /**
     * Computes the fraction of shared elements relative to the larger of the two sets.
     *
     * @param set  first set
     * @param set2 second set
     * @return the number of elements of {@code set} also in {@code set2}, divided by the size of
     *         the larger set; {@code NaN} when both sets are empty
     */
    public double commonElementsPercentage(Set<String> set, Set<String> set2) {
        double larger = Math.max(set.size(), set2.size());
        long shared = set.stream().filter(set2::contains).count();
        return shared / larger;
    }

    /**
     * Translates every element through the map.
     *
     * @param set the keys to translate
     * @param map the translation map
     * @return the set of mapped values; contains {@code null} if some key has no mapping
     */
    public Set<String> toCodes(Set<String> set, Map<String, String> map) {
        return set.stream().map(key -> map.get(key)).collect(Collectors.toSet());
    }

    /**
     * Same as {@link #toCodes(Set, Map)}.
     *
     * @param set the keywords to translate
     * @param map the translation map
     * @return the set of mapped values
     */
    public Set<String> keywordsToCodes(Set<String> set, Map<String, String> map) {
        return toCodes(set, map);
    }

    /**
     * Translates the given names through the city map loaded from the classpath.
     *
     * @param set the names to translate
     * @return the set of mapped values
     */
    public Set<String> citiesToCodes(Set<String> set) {
        return toCodes(set, cityMap);
    }

    /**
     * @param str the text to read
     * @return the first character of the text, lower-cased (empty for an empty string)
     */
    public String firstLC(String str) {
        return StringUtils.substring(str, 0, 1).toLowerCase(Locale.ROOT);
    }

    /**
     * Splits the text on blanks, trimming tokens and dropping empty ones.
     *
     * @param str the text to split
     * @param i   the maximum number of tokens to return
     * @return a lazy view over at most {@code i} tokens
     */
    public Iterable<String> tokens(String str, int i) {
        Iterable<String> all = Splitter.on(" ").omitEmptyStrings().trimResults().split(str);
        return Iterables.limit(all, i);
    }

    /**
     * Lower-cases the identifier and strips the prefixes matched by {@link #DOI_PREFIX}.
     *
     * @param str the persistent identifier
     * @return the normalized identifier
     */
    public String normalizePid(String str) {
        return str.toLowerCase(Locale.ROOT).replaceAll(DOI_PREFIX, "");
    }

    /**
     * Finds the phrases of the text that are keys of the map, trying the longest windows first.
     *
     * <p>The text is lower-cased once, then windows of {@code i} blank-separated tokens down to
     * one token are compared with the map keys. Every matched phrase is removed from the text
     * before the next, shorter window size is tried.
     *
     * @param str the text to scan
     * @param map the map whose keys are the known phrases
     * @param i   the maximum window size in tokens; values below one yield an empty result
     * @return the matched phrases
     */
    public Set<String> getKeywords(String str, Map<String, String> map, int i) {
        // Lower-case up front so that matching, removal and re-tokenizing all see the same text.
        String remaining = str.toLowerCase(Locale.ROOT);
        List<String> words = Arrays.asList(remaining.split(" "));
        Set<String> found = new HashSet<>();
        int widest = Math.min(i, words.size());
        for (int width = widest; width > 0; width--) {
            for (int start = 0; start <= words.size() - width; start++) {
                String candidate = concat(words.subList(start, start + width));
                if (map.containsKey(candidate)) {
                    found.add(candidate);
                    remaining = remaining.replace(candidate, "").trim();
                }
            }
            // re-tokenize only after the whole pass, so one pass sees a stable token list
            words = Arrays.asList(remaining.split(" "));
        }
        return found;
    }

    /**
     * Same as {@link #getKeywords(String, Map, int)} using the city map loaded from the classpath.
     *
     * @param str the text to scan
     * @param i   the maximum window size in tokens
     * @return the matched city names
     */
    public Set<String> getCities(String str, int i) {
        return getKeywords(str, cityMap, i);
    }

    /**
     * Reads a whole classpath resource as UTF-8 text.
     *
     * @param str the resource location, resolved against {@code cls}
     * @param cls the class used to locate the resource
     * @param <T> the type of the locating class
     * @return the resource content
     * @throws RuntimeException if the resource is missing or cannot be read
     */
    public static <T> String readFromClasspath(String str, Class<T> cls) {
        try (InputStream in = cls.getResourceAsStream(str)) {
            if (in == null) {
                throw new RuntimeException("cannot load resource from classpath: " + str);
            }
            StringWriter content = new StringWriter();
            IOUtils.copy(in, content, StandardCharsets.UTF_8);
            return content.toString();
        } catch (IOException e) {
            throw new RuntimeException("cannot load resource from classpath: " + str, e);
        }
    }
}
