package eu.dnetlib.data.mapreduce.hbase.dedup;

import java.io.IOException;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import java.util.Queue;
import java.util.Set;

import eu.dnetlib.data.mapreduce.JobParams;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;

import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

import eu.dnetlib.data.mapreduce.hbase.VolatileColumnFamily;
import eu.dnetlib.pace.model.PersonComparatorUtils;

/**
 * Reducer that looks for similar persons among the values grouped under one key.
 *
 * <p>
 * The values of a key are parsed into {@link DedupPersonBean}s and compared pairwise by name with
 * {@link PersonComparatorUtils#areSimilar}. For every pair found similar:
 * <ul>
 * <li>both person ids are written as rows carrying an empty <code>isCandidate</code> column in the
 * {@link VolatileColumnFamily#dedup} family;</li>
 * <li>every result id attached to either person is written as a row carrying, in the
 * {@link VolatileColumnFamily#dedupPerson} family, an empty column whose qualifier is the person id.</li>
 * </ul>
 */
public class FindDedupCandidatePersonsReducer extends TableReducer<Text, Text, ImmutableBytesWritable> {

	/** Threshold on the number of beans buffered for a single key; see <code>prepare</code>. */
	private static final int LIMIT = 5000;

	/**
	 * No per-task initialisation is performed.
	 */
	@Override
	protected void setup(final Context context) throws IOException, InterruptedException {

	}

	/**
	 * Compares all the persons grouped under <code>key</code> pairwise and emits the candidates found.
	 *
	 * @param key
	 *            the grouping key shared by all the values
	 * @param values
	 *            the serialized persons, parsed with <code>DedupPersonBean.fromText</code>
	 * @param context
	 *            the reducer context used for counters and output
	 * @throws IOException
	 *             propagated from <code>context.write</code>
	 * @throws InterruptedException
	 *             propagated from <code>context.write</code>
	 */
	@Override
	protected void reduce(final Text key, final Iterable<Text> values, final Context context) throws IOException, InterruptedException {
		System.out.println("\nReducing key: " + key);

		final Set<String> candidates = Sets.newHashSet();
		final Map<String, Set<String>> resultIds = Maps.newHashMap();

		final Queue<DedupPersonBean> queue = prepare(context, key, values);
		while (!queue.isEmpty()) {

			// The pivot leaves the queue before the scan, so each unordered pair is compared exactly once.
			final DedupPersonBean pivot = queue.remove();

			// The pivot only needs to be recorded once, however many persons it matches.
			boolean pivotRecorded = false;

			for (final DedupPersonBean curr : queue) {

				if (!PersonComparatorUtils.areSimilar(pivot.getName(), curr.getName())) {
					continue;
				}

				System.out.println("- Similar persons: [" + pivot.getName() + "] - [" + curr.getName() + "]");

				if (!pivotRecorded) {
					candidates.add(pivot.getId());
					collectResultIds(resultIds, pivot);
					pivotRecorded = true;
				}

				candidates.add(curr.getId());
				collectResultIds(resultIds, curr);
			}
		}

		emitCandidates(context, candidates);
		emitResultCandidates(context, resultIds);
	}

	/**
	 * Merges the result ids of <code>person</code> into the set registered under its id, creating the set on first use.
	 *
	 * @param resultIds
	 *            accumulator mapping a person id to the ids of its results
	 * @param person
	 *            the person whose results must be recorded
	 */
	private void collectResultIds(final Map<String, Set<String>> resultIds, final DedupPersonBean person) {
		final String personId = person.getId();

		Set<String> collected = resultIds.get(personId);
		if (collected == null) {
			collected = new HashSet<String>();
			resultIds.put(personId, collected);
		}
		collected.addAll(person.getResults());
	}

	/**
	 * Parses the values into a queue of beans, giving up once the queue grows beyond {@link #LIMIT}.
	 *
	 * <p>
	 * The size check happens after the insertion, so the returned queue may hold up to <code>LIMIT + 1</code> beans; the remaining
	 * values are not read. When that happens a counter is incremented and a message is printed.
	 *
	 * @param context
	 *            the reducer context, used for the counter and the task attempt id
	 * @param key
	 *            the key being reduced, used only for reporting
	 * @param values
	 *            the serialized persons
	 * @return the beans to compare, in iteration order
	 */
	private Queue<DedupPersonBean> prepare(final Context context, final Text key, final Iterable<Text> values) {
		final Queue<DedupPersonBean> queue = new LinkedList<DedupPersonBean>();

		for (final Text i : values) {
			queue.add(DedupPersonBean.fromText(i));

			if (queue.size() > LIMIT) {
				// NOTE(review): the counter name embeds the key and the task attempt id, so every truncated key creates a
				// distinct counter. Hadoop caps the number of counters per job - confirm that this cannot be exceeded.
				context.getCounter("Comparison list > " + LIMIT, "'" + key.toString() + "', --> " + context.getTaskAttemptID()).increment(1);
				System.out.println("breaking out after limit (" + LIMIT + ") for key '" + key);
				break;
			}
		}

		return queue;
	}

	/**
	 * Writes one Put per candidate person: row = person id, family = {@link VolatileColumnFamily#dedup},
	 * qualifier = <code>isCandidate</code>, empty value.
	 *
	 * @param context
	 *            the reducer context receiving the Puts
	 * @param candidates
	 *            the ids of the persons found similar to at least one other person
	 * @throws IOException
	 *             propagated from <code>context.write</code>
	 * @throws InterruptedException
	 *             propagated from <code>context.write</code>
	 */
	private void emitCandidates(final Context context, final Set<String> candidates) throws IOException, InterruptedException {
		final byte[] cf = Bytes.toBytes(VolatileColumnFamily.dedup.toString());
		final byte[] col = Bytes.toBytes("isCandidate");
		final byte[] val = Bytes.toBytes("");

		for (final String s : candidates) {
			final byte[] id = Bytes.toBytes(s);
			final Put put = new Put(id).add(cf, col, val);
			put.setWriteToWAL(JobParams.WRITE_TO_WAL);
			context.write(new ImmutableBytesWritable(id), put);
		}
		context.getCounter(getClass().getSimpleName(), "N. Put. (persons)").increment(candidates.size());
	}

	/**
	 * Writes one Put per (person, result) pair: row = result id, family = {@link VolatileColumnFamily#dedupPerson},
	 * qualifier = person id, empty value.
	 *
	 * @param context
	 *            the reducer context receiving the Puts
	 * @param resultIds
	 *            maps each candidate person id to the ids of its results
	 * @throws IOException
	 *             propagated from <code>context.write</code>
	 * @throws InterruptedException
	 *             propagated from <code>context.write</code>
	 */
	private void emitResultCandidates(final Context context, final Map<String, Set<String>> resultIds) throws IOException, InterruptedException {
		final byte[] cf = Bytes.toBytes(VolatileColumnFamily.dedupPerson.toString());
		final byte[] val = Bytes.toBytes("");

		// Iterating the entries avoids looking each person id up again to reach its result set.
		for (final Map.Entry<String, Set<String>> entry : resultIds.entrySet()) {
			final byte[] col = Bytes.toBytes(entry.getKey());
			final Set<String> results = entry.getValue();

			for (final String s : results) {
				final byte[] id = Bytes.toBytes(s);
				final Put put = new Put(id).add(cf, col, val);
				put.setWriteToWAL(JobParams.WRITE_TO_WAL);
				context.write(new ImmutableBytesWritable(id), put);
			}
			context.getCounter(getClass().getSimpleName(), "N. Put. (results)").increment(results.size());
		}
	}

}
