package ru.yandex.msearch;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.codecs.fast_commit.FastCommitCodec;
import org.apache.lucene.index.codecs.yandex.YandexCodec;
import org.apache.lucene.document.MapFieldSelector;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.Bits;

import java.io.File;
import java.util.BitSet;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;

/**
 * Command-line tool that scans a Lucene index and deletes every live document
 * whose prefix field (default {@code "suid"}) fails validation.
 *
 * <p>A document is kept only if all of the following hold:
 * <ul>
 *   <li>the prefix field is present;</li>
 *   <li>its value parses as a {@code long};</li>
 *   <li>when a lucene shard was given with {@code -l}, {@code prefix % shardsCount}
 *       equals that shard;</li>
 *   <li>{@code prefix % MAGIC_NUMBER} falls into one of the ranges given with
 *       {@code -r} (both ends inclusive).</li>
 * </ul>
 *
 * <p><b>Warning:</b> when no {@code -r} range is supplied the range set is empty,
 * so every live document fails the last check and is deleted.
 *
 * <p>All state is kept in static fields; the class is a single-shot CLI entry
 * point and cannot be instantiated.
 */
public class GarbageCleaner {
    private GarbageCleaner() {}

    /** Modulus applied to the numeric prefix to obtain the index tested against the ranges. */
    private static final int MAGIC_NUMBER = 65534;

    // Counters of rejected documents, reported at the end of the scan.
    private static int nullSuid;
    private static int garbageSuid;
    private static int outOfRangeSuid;
    private static int outOfShardSuid;

    /** Name of the stored field that holds the numeric prefix (option {@code -f}). */
    private static String prefixField = "suid";
    /** Expected value of {@code prefix % shardsCount}; {@code -1} disables the shard check (option {@code -l}). */
    private static int luceneShard = -1;
    /** Divisor used for the shard check (option {@code -c}). */
    private static int shardsCount = 1000;
    /** Bit {@code i} is set when {@code i} belongs to at least one accepted range. */
    private static BitSet rangesSet;

    /**
     * Parses the command line, scans the index and deletes invalid documents.
     *
     * <p>Recognised options: {@code -i dir}, {@code -f field}, {@code -l shard},
     * {@code -c shardsCount} and any number of {@code -r start:end}. Arguments
     * that match none of them are ignored (a warning is printed to stderr).
     * With no arguments, or with {@code -h}/{@code -help} first, the usage text
     * is printed and the JVM exits with status 0.
     *
     * @param args command-line arguments
     * @throws IllegalArgumentException if an option lacks its value or the value is malformed
     * @throws Exception if opening, reading or modifying the index fails
     */
    public static void main(String[] args) throws Exception {
        String usage =
            "Usage: java ru.yandex.msearch.GarbageCleaner [-i dir] " +
            "[-f prefix_feld] [-l luceneShardNum] [-c luceneShardsCount] [-r start:end] [-r start:end] ....";
        if (args.length < 1 || "-h".equals(args[0]) || "-help".equals(args[0])) {
            System.out.println(usage);
            System.exit(0);
        }

        String index = "index";
        List<Range> ranges = new LinkedList<Range>();

        for (int i = 0; i < args.length; i++) {
            String option = args[i];
            if ("-i".equals(option)) {
                index = optionValue(args, i);
                i++;
            } else if ("-f".equals(option)) {
                prefixField = optionValue(args, i);
                i++;
            } else if ("-r".equals(option)) {
                ranges.add(parseRange(optionValue(args, i)));
                i++;
            } else if ("-l".equals(option)) {
                luceneShard = parseIntValue(option, optionValue(args, i));
                i++;
            } else if ("-c".equals(option)) {
                shardsCount = parseIntValue(option, optionValue(args, i));
                if (shardsCount <= 0) {
                    // A zero divisor would throw ArithmeticException on the first document.
                    throw new IllegalArgumentException(
                        "Shards count must be positive: " + shardsCount);
                }
                i++;
            } else {
                // Still ignored, but a mistyped option on a destructive tool
                // should not go unnoticed.
                System.err.println("Ignoring unknown argument: " + option);
            }
        }

        rangesSet = buildRangesSet(ranges);

        CodecProvider cp = CodecProvider.getDefault();
        cp.register(new YandexCodec());
        cp.register(new FastCommitCodec(new HashSet()));

        // NOTE(review): the 'false' argument is presumably readOnly=false, which
        // the deleteDocument calls below would require - confirm against the
        // Lucene version in use.
        IndexReader reader = IndexReader.open(NIOFSDirectory.get(new File(index)), false);

        // Load only the prefix field; nothing else is needed for validation.
        FieldSelector fs = new MapFieldSelector(new String[]{prefixField});

        Bits deleted = MultiFields.getDeletedDocs(reader);

        int deletes = 0;
        int maxDoc = reader.maxDoc();
        System.out.println("Scaning started");
        for (int i = 0; i < maxDoc; i++) {
            if (i % 10000 == 0) {
                System.out.print("\rscanned: " + i + ", deleted so far: " + deletes + "                  ");
            }
            if (deleted != null && deleted.get(i)) {
                // Skip documents that were already deleted before this run.
                continue;
            }
            Document doc = reader.document(i, fs);
            if (!validDocument(doc)) {
                reader.deleteDocument(i);
                deletes++;
            }
        }
        System.out.println("Scan finished");
        System.out.println("Deleted: " + deletes);
        System.out.println("Null suid: " + nullSuid);
        System.out.println("Garbase suid: " + garbageSuid);
        System.out.println("Out of lucene shard suid: " + outOfShardSuid);
        System.out.println("Out of Magic shard suid: " + outOfRangeSuid);
        reader.flush();
        reader.close();
    }

    /**
     * Returns the value that follows the option at {@code args[optionIndex]}.
     *
     * @throws IllegalArgumentException if the option is the last argument
     */
    private static String optionValue(String[] args, int optionIndex) {
        if (optionIndex + 1 >= args.length) {
            throw new IllegalArgumentException(
                "Missing value for option " + args[optionIndex]);
        }
        return args[optionIndex + 1];
    }

    /**
     * Parses an integer option value, naming the offending option on failure.
     *
     * @throws IllegalArgumentException if {@code value} is not a valid int
     */
    private static int parseIntValue(String option, String value) {
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException(
                "Invalid integer for " + option + ": " + value, e);
        }
    }

    /**
     * Parses a {@code start:end} range specification. Parts after the second
     * one are ignored, as before.
     *
     * @throws IllegalArgumentException if the spec has fewer than two parts,
     *         a part is not an int, or {@code start} is negative
     */
    private static Range parseRange(String spec) {
        String[] se = spec.split(":");
        if (se.length < 2) {
            throw new IllegalArgumentException(
                "Range must have the form start:end, got: " + spec);
        }
        int start = parseIntValue("-r", se[0]);
        int end = parseIntValue("-r", se[1]);
        if (start < 0) {
            // BitSet cannot hold negative indices.
            throw new IllegalArgumentException(
                "Range start must not be negative: " + spec);
        }
        return new Range(start, end);
    }

    /**
     * Builds the set of accepted {@code prefix % MAGIC_NUMBER} values from the
     * given inclusive ranges, echoing each range to stdout.
     */
    private static BitSet buildRangesSet(List<Range> ranges) {
        BitSet set = new BitSet(MAGIC_NUMBER);
        for (Range r : ranges) {
            System.out.println("Range: start=" + r.start + ", end=" + r.end);
            // Indices >= MAGIC_NUMBER can never be produced by the modulo in
            // validDocument, so clamping changes no result while avoiding int
            // overflow on end + 1 and a needlessly huge bit set.
            int last = Math.min(r.end, MAGIC_NUMBER - 1);
            if (r.start <= last) {
                set.set(r.start, last + 1);
            }
        }
        return set;
    }

    /**
     * Decides whether a document should be kept, incrementing the matching
     * rejection counter when it should not.
     *
     * @param doc document loaded with at least the prefix field
     * @return {@code true} if the document passes every check, {@code false}
     *         if it should be deleted
     */
    private static boolean validDocument(final Document doc) {
        String prefixString = doc.get(prefixField);
        if (prefixString == null) {
            nullSuid++;
            return false;
        }
        long prefix;
        try {
            prefix = Long.parseLong(prefixString);
        } catch (NumberFormatException e) {
            garbageSuid++;
            return false;
        }
        if (luceneShard != -1 && (int) (prefix % shardsCount) != luceneShard) {
            outOfShardSuid++;
            return false;
        }
        // Java's % keeps the sign of the dividend, so a negative prefix yields a
        // negative index; BitSet.get would throw on it. Such an index belongs to
        // no range.
        long rangeIndex = prefix % MAGIC_NUMBER;
        boolean inRange = rangeIndex >= 0 && rangesSet.get((int) rangeIndex);
        if (!inRange) {
            outOfRangeSuid++;
        }
        return inRange;
    }

    /** Inclusive range of accepted {@code prefix % MAGIC_NUMBER} values. */
    private static class Range {
        public final int start;
        public final int end;

        public Range(int start, int end) {
            this.start = start;
            this.end = end;
        }
    }
}
