package eu.dnetlib.dedup;

import com.google.common.collect.Sets;
import com.wcohen.ss.JaroWinkler;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.Person;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.LongAccumulator;
import scala.Tuple2;

/* loaded from: input_file:eu/dnetlib/dedup/DedupUtility.class */
/**
 * Static helper methods for the deduplication workflow: Spark accumulator set-up, HDFS and
 * classpath access, identifier hashing, output path construction and merging of author lists.
 */
public class DedupUtility {
    /** Similarity score that a base author must exceed before a PID is attached to it. */
    private static final double THRESHOLD = 0.95d;

    /**
     * Creates the named long accumulators used to count dedup events.
     *
     * <p>Every accumulator name has the form {@code <entityType>::<label>} and is used both as
     * the map key and as the Spark accumulator name.
     *
     * @param dedupConfig configuration providing entity type, order field, group size and threshold
     * @param sparkContext context on which the accumulators are registered
     * @return a mutable map from accumulator name to accumulator
     */
    public static Map<String, LongAccumulator> constructAccumulator(DedupConfig dedupConfig, SparkContext sparkContext) {
        final Map<String, LongAccumulator> accumulators = new HashMap<>();
        registerAccumulator(accumulators, sparkContext, dedupConfig, "records per hash key = 1");
        registerAccumulator(accumulators, sparkContext, dedupConfig, "missing " + dedupConfig.getWf().getOrderField());
        registerAccumulator(accumulators, sparkContext, dedupConfig,
                String.format("Skipped records for count(%s) >= %s", dedupConfig.getWf().getOrderField(), dedupConfig.getWf().getGroupMaxSize()));
        registerAccumulator(accumulators, sparkContext, dedupConfig, "skip list");
        registerAccumulator(accumulators, sparkContext, dedupConfig, "dedupSimilarity (x2)");
        registerAccumulator(accumulators, sparkContext, dedupConfig, "d < " + dedupConfig.getWf().getThreshold());
        return accumulators;
    }

    /** Registers one accumulator named {@code <entityType>::<label>} and stores it under that name. */
    private static void registerAccumulator(Map<String, LongAccumulator> accumulators, SparkContext sparkContext, DedupConfig dedupConfig, String label) {
        final String name = String.format("%s::%s", dedupConfig.getWf().getEntityType(), label);
        accumulators.put(name, sparkContext.longAccumulator(name));
    }

    /**
     * Reads the text file(s) at the given path as an RDD of lines.
     *
     * @param str path handed to {@link JavaSparkContext#textFile(String)}
     * @param javaSparkContext context used to read the data
     * @return the RDD returned by {@code textFile}
     */
    public static JavaRDD<String> loadDataFromHDFS(String str, JavaSparkContext javaSparkContext) {
        return javaSparkContext.textFile(str);
    }

    /**
     * Recursively deletes the given path if it exists on the default-configured file system.
     *
     * @param str path to remove
     * @throws IOException if the file system cannot be accessed or the deletion fails
     */
    public static void deleteIfExists(String str) throws IOException {
        final FileSystem fileSystem = FileSystem.get(new Configuration());
        final Path target = new Path(str);
        if (fileSystem.exists(target)) {
            fileSystem.delete(target, true);
        }
    }

    /**
     * Loads a dedup configuration from a UTF-8 encoded file on the default-configured file system.
     *
     * @param str path of the configuration file
     * @return the configuration parsed by {@link DedupConfig#load}
     * @throws IOException if the file cannot be opened or read
     */
    public static DedupConfig loadConfigFromHDFS(String str) throws IOException {
        final FileSystem fileSystem = FileSystem.get(new Configuration());
        // The stream is closed once the content has been read; the FileSystem handle is left
        // open on purpose because it is obtained from FileSystem.get and not owned here.
        try (FSDataInputStream in = fileSystem.open(new Path(str))) {
            return DedupConfig.load(IOUtils.toString(in, StandardCharsets.UTF_8.name()));
        }
    }

    /**
     * Reads a classpath resource, resolved relative to the given class, as a UTF-8 string.
     *
     * @param str resource name as accepted by {@link Class#getResourceAsStream(String)}
     * @param cls class whose loader is used to resolve the resource
     * @return the resource content
     * @throws RuntimeException if the resource does not exist or cannot be read
     */
    static <T> String readFromClasspath(String str, Class<T> cls) {
        try (InputStream in = cls.getResourceAsStream(str)) {
            if (in == null) {
                // getResourceAsStream signals a missing resource with null, not an exception.
                throw new RuntimeException("cannot load resource from classpath: " + str);
            }
            return IOUtils.toString(in, StandardCharsets.UTF_8.name());
        } catch (IOException e) {
            throw new RuntimeException("cannot load resource from classpath: " + str, e);
        }
    }

    /**
     * Computes the grouping (clustering) keys of a document.
     *
     * @param dedupConfig configuration passed to the clustering combiner
     * @param mapDocument document to compute keys for
     * @return the keys produced by {@link BlacklistAwareClusteringCombiner#filterAndCombine}, as a set
     */
    public static Set<String> getGroupingKeys(DedupConfig dedupConfig, MapDocument mapDocument) {
        return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(mapDocument, dedupConfig));
    }

    /**
     * Returns the lowercase hexadecimal MD5 digest of the UTF-8 bytes of a string.
     *
     * @param str text to hash
     * @return the 32-character hex digest, or {@code null} when {@code str} is {@code null}
     * @throws IllegalStateException if the JVM provides no MD5 implementation
     */
    public static String md5(String str) {
        if (str == null) {
            // Kept for backward compatibility: a null input has always produced this
            // message and a null result rather than an exception.
            System.err.println("Error creating id");
            return null;
        }
        try {
            final MessageDigest digest = MessageDigest.getInstance("MD5");
            return new String(Hex.encodeHex(digest.digest(str.getBytes(StandardCharsets.UTF_8))));
        } catch (NoSuchAlgorithmException e) {
            throw new IllegalStateException("MD5 digest is not available", e);
        }
    }

    /**
     * Merges two author lists by picking one as the base and enriching it with PIDs of the other.
     *
     * <p>The base is the list with more PID-bearing authors; when both have the same number, the
     * strictly longer list wins, and on a full tie the second list is used. The base list is
     * modified in place and returned.
     *
     * @param list first author list, may be {@code null}
     * @param list2 second author list, may be {@code null}
     * @return the chosen base list (possibly {@code null} when the chosen list is {@code null})
     */
    public static List<Author> mergeAuthor(List<Author> list, List<Author> list2) {
        final int firstPidCount = countAuthorsPids(list);
        final int secondPidCount = countAuthorsPids(list2);
        final boolean firstIsBase = firstPidCount == secondPidCount
                ? authorsSize(list) > authorsSize(list2)
                : firstPidCount > secondPidCount;
        final List<Author> base = firstIsBase ? list : list2;
        final List<Author> enrich = firstIsBase ? list2 : list;
        enrichPidFromList(base, enrich);
        return base;
    }

    /**
     * Copies PIDs that appear only in {@code list2} onto the most similar author of {@code list}.
     *
     * <p>A PID is attached only when the best similarity score exceeds {@link #THRESHOLD}.
     * Null authors and null PID entries are skipped.
     */
    private static void enrichPidFromList(List<Author> list, List<Author> list2) {
        if (list == null || list2 == null) {
            return;
        }
        // Comparable form of every PID already present on the base authors.
        final Set<String> knownPids = list.stream()
                .filter(author -> author != null && author.getPid() != null)
                .flatMap(author -> author.getPid().stream())
                .filter(pid -> pid != null)
                .map(pid -> pid.toComparableString())
                .collect(Collectors.toSet());

        // Collect all missing (pid, owner) pairs first, so that the PID lists mutated below are
        // never being iterated at the same time.
        list2.stream()
                .filter(author -> author != null && author.getPid() != null)
                .flatMap(author -> author.getPid().stream()
                        .filter(pid -> pid != null && !knownPids.contains(pid.toComparableString()))
                        .map(pid -> new Tuple2<>(pid, author)))
                .collect(Collectors.toList())
                .forEach(missing -> findBestMatch(list, missing._2()).ifPresent(target -> {
                    // Base authors without any PID are valid targets: give them a list first.
                    if (target.getPid() == null) {
                        target.setPid(new ArrayList<>());
                    }
                    target.getPid().add(missing._1());
                }));
    }

    /** Returns the base author most similar to {@code candidate}, if its score exceeds the threshold. */
    private static Optional<Author> findBestMatch(List<Author> base, Author candidate) {
        return base.stream()
                .filter(author -> author != null)
                .map(author -> new Tuple2<>(sim(author, candidate), author))
                .max(Comparator.comparing(Tuple2::_1))
                .filter(scored -> scored._1() > THRESHOLD)
                .map(scored -> scored._2());
    }

    /**
     * Builds the path of an entity folder.
     *
     * @return {@code <str>/<str2>}
     */
    public static String createEntityPath(String str, String str2) {
        return String.format("%s/%s", str, str2);
    }

    /**
     * Builds the path of the similarity relations of an entity.
     *
     * @return {@code <str>/<str2>_simRel}
     */
    public static String createSimRelPath(String str, String str2) {
        return String.format("%s/%s_simRel", str, str2);
    }

    /**
     * Builds the path of the merge relations of an entity.
     *
     * @return {@code <str>/<str2>_mergeRel}
     */
    public static String createMergeRelPath(String str, String str2) {
        return String.format("%s/%s_mergeRel", str, str2);
    }

    /**
     * Jaro-Winkler similarity of two authors.
     *
     * <p>When both parsed names are accurate the normalized surnames are compared, otherwise the
     * normalized full names.
     */
    private static Double sim(Author author, Author author2) {
        final Person first = parse(author);
        final Person second = parse(author2);
        final JaroWinkler metric = new JaroWinkler();
        if (first.isAccurate() && second.isAccurate()) {
            // NOTE(review): only surnames are compared here, so distinct authors sharing a
            // surname score identically - confirm this is intended.
            return metric.score(normalize(first.getSurnameString()), normalize(second.getSurnameString()));
        }
        return metric.score(normalize(first.getNormalisedFullname()), normalize(second.getNormalisedFullname()));
    }

    /**
     * Lowercases the NFD form of a string and replaces runs of non-word characters, combining
     * marks, punctuation, digits and newlines with a single space, then trims.
     */
    private static String normalize(String str) {
        // Locale.ROOT keeps the lowercasing independent of the JVM default locale.
        // NOTE(review): "\\W" runs first and, without Unicode character classes, also matches
        // combining marks and non-ASCII letters, so accented letters are split by a space
        // rather than folded - verify whether that is desired before changing the order.
        return nfd(str)
                .toLowerCase(Locale.ROOT)
                .replaceAll("(\\W)+", " ")
                .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
                .replaceAll("(\\p{Punct})+", " ")
                .replaceAll("(\\d)+", " ")
                .replaceAll("(\\n)+", " ")
                .trim();
    }

    /** Returns the canonical decomposition (NFD) of a string. */
    private static String nfd(String str) {
        return Normalizer.normalize(str, Normalizer.Form.NFD);
    }

    /**
     * Builds a {@link Person} from an author: {@code "surname, name"} when the surname is not
     * blank, the full name otherwise.
     */
    private static Person parse(Author author) {
        if (StringUtils.isNotBlank(author.getSurname())) {
            // NOTE(review): a null name ends up as the literal text "null" here - confirm
            // whether authors can carry a surname without a name.
            return new Person(author.getSurname() + ", " + author.getName(), false);
        }
        // Guard against a missing full name instead of handing null to Person.
        final String fullname = author.getFullname();
        return new Person(fullname != null ? fullname : "", false);
    }

    /** Counts the authors of a list that carry at least one usable PID; 0 for a null list. */
    private static int countAuthorsPids(List<Author> list) {
        return list == null ? 0 : (int) list.stream().filter(DedupUtility::hasPid).count();
    }

    /** Returns the size of a list, treating {@code null} as empty. */
    private static int authorsSize(List<Author> list) {
        return list == null ? 0 : list.size();
    }

    /** Tells whether an author has at least one non-null PID with a non-blank value. */
    private static boolean hasPid(Author author) {
        if (author == null || author.getPid() == null) {
            return false;
        }
        // anyMatch is false on an empty list, so no separate emptiness check is needed.
        return author.getPid().stream()
                .anyMatch(pid -> pid != null && StringUtils.isNotBlank(pid.getValue()));
    }
}
