package ru.yandex.msearch;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.codecs.yandex.YandexCodec;
import org.apache.lucene.document.MapFieldSelector;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.Bits;

import com.tecnick.htmlutils.htmlentities.HTMLEntities;

import java.net.URLDecoder;

import java.io.File;
import java.util.HashSet;
import java.util.Arrays;

/**
 * Command-line tool that prints the URLs stored in the {@code x_urls} field of
 * the documents of a sharded Lucene index.
 *
 * <p>Shards are expected in the sub-directories {@code <index>/0},
 * {@code <index>/1}, ... {@code <index>/<shardsCount - 1>}.  For every
 * non-deleted document whose {@code received_date} field lies within
 * {@code [from, to]} the lines of {@code x_urls} are URL-decoded,
 * HTML-entity-decoded, de-duplicated within that document and written to
 * standard output, one per line.  Progress messages go to standard error.
 */
public class ListXURLS {

  /** Command-line synopsis, printed for {@code -h}/{@code -help} and on bad options. */
  private static final String USAGE =
      "Usage: java ru.yandex.msearch.ListXURLS -index dir -from unixtime1 -to unixtime2 -shards shardsCount";

  /** The only stored fields that have to be loaded for each document. */
  private static final String[] FIELDS = { "received_date", "x_urls" };

  /** Not instantiable: the class only hosts {@link #main(String[])}. */
  private ListXURLS() {}

  /**
   * Entry point.
   *
   * <p>Recognized options (each takes one value; unknown arguments are ignored):
   * <ul>
   *   <li>{@code -index dir} - parent directory of the shards (default {@code "index"})</li>
   *   <li>{@code -from unixtime} - lower bound, inclusive (default: now minus 30 days)</li>
   *   <li>{@code -to unixtime} - upper bound, inclusive (default: now)</li>
   *   <li>{@code -shards n} - number of shard directories to scan (default 1000)</li>
   * </ul>
   *
   * @param args command-line arguments
   * @throws IllegalArgumentException if an option is given without its value
   * @throws NumberFormatException if a numeric option value cannot be parsed
   * @throws Exception if a shard cannot be opened, scanned or closed
   */
  public static void main(String[] args) throws Exception {
    if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) {
      System.out.println(USAGE);
      System.exit(0);
    }

    long now = System.currentTimeMillis() / 1000;

    String index = "index";
    long fromTime = now - 86400 * 30;
    long toTime = now;
    int shardsCount = 1000;

    for (int i = 0; i < args.length; i++) {
      String option = args[i];
      if ("-index".equals(option)) {
        index = optionValue(args, ++i);
      } else if ("-from".equals(option)) {
        fromTime = Long.parseLong(optionValue(args, ++i));
      } else if ("-to".equals(option)) {
        toTime = Long.parseLong(optionValue(args, ++i));
      } else if ("-shards".equals(option)) {
        shardsCount = Integer.parseInt(optionValue(args, ++i));
      }
    }

    // The shards are read through the Yandex codec, so it has to be
    // registered and made the default before any reader is opened.
    CodecProvider cp = CodecProvider.getDefault();
    cp.register(new YandexCodec());
    cp.setDefaultFieldCodec("Yandex");

    // The selector does not depend on the shard; build it once.
    FieldSelector fs = new MapFieldSelector(FIELDS);

    for (int shard = 0; shard < shardsCount; shard++) {
      System.err.println("Opening index: " + shard);
      IndexReader reader = IndexReader.open(NIOFSDirectory.get(new File(index + "/" + shard)));
      try {
        dumpShard(reader, fs, fromTime, toTime);
      } finally {
        // Release the reader even when scanning the shard fails.
        reader.close();
      }
    }
  }

  /**
   * Returns the value that follows an option on the command line.
   *
   * @param args command-line arguments
   * @param valueIndex index at which the option's value is expected
   *     (the option itself is at {@code valueIndex - 1})
   * @return {@code args[valueIndex]}
   * @throws IllegalArgumentException if the option is the last argument and
   *     therefore has no value
   */
  private static String optionValue(String[] args, int valueIndex) {
    if (valueIndex >= args.length) {
      throw new IllegalArgumentException(
          "Missing value for option " + args[valueIndex - 1] + "\n" + USAGE);
    }
    return args[valueIndex];
  }

  /**
   * Prints the URLs of all live documents of one shard whose
   * {@code received_date} lies within {@code [fromTime, toTime]}.
   *
   * <p>Processing is best-effort: a document that cannot be loaded, or whose
   * date or URL lines cannot be parsed/decoded, is skipped silently.  If
   * decoding fails in the middle of a document, the lines already printed for
   * it stay printed and the remaining lines of that document are dropped.
   *
   * @param reader open reader of the shard; not closed by this method
   * @param fs selector restricting the stored fields that are loaded
   * @param fromTime lower bound for {@code received_date}, inclusive
   * @param toTime upper bound for {@code received_date}, inclusive
   * @throws Exception if the shard's deletions or document count cannot be read
   */
  private static void dumpShard(IndexReader reader, FieldSelector fs, long fromTime, long toTime)
      throws Exception {
    Bits deleted = MultiFields.getDeletedDocs(reader);
    // Reused for every document; cleared before each document's lines are processed.
    HashSet<String> seen = new HashSet<String>();

    int maxDoc = reader.maxDoc();
    for (int d = 0; d < maxDoc; d++) {
      if (deleted != null && deleted.get(d)) {
        continue;
      }

      Document doc;
      try {
        doc = reader.document(d, fs);
      } catch (Exception ignored) {
        // Best-effort: an unreadable document must not abort the whole dump.
        continue;
      }

      String xurls = doc.get("x_urls");
      if (xurls == null) {
        continue;
      }
      String receivedDateStr = doc.get("received_date");
      if (receivedDateStr == null) {
        continue;
      }

      try {
        // Parsed as long so that timestamps beyond Integer.MAX_VALUE are
        // compared against the (long) bounds instead of being rejected.
        long receivedDate = Long.parseLong(receivedDateStr);
        if (receivedDate < fromTime || receivedDate > toTime) {
          continue;
        }

        seen.clear();
        for (String rawLine : xurls.split("\n")) {
          // Decoding may itself produce line breaks (e.g. from %0A); flatten
          // them so that every URL occupies exactly one output line.
          String line = HTMLEntities.unhtmlentities(URLDecoder.decode(rawLine, "utf8"))
              .replace('\n', ' ')
              .trim();
          if (line.startsWith("src=") || line.startsWith("mailto") || line.startsWith("file")
              || line.length() <= 3) {
            continue;
          }
          // add() returns false for a line already printed for this document.
          if (seen.add(line)) {
            System.out.println(line);
          }
        }
      } catch (Exception ignored) {
        // Best-effort: a malformed date or an undecodable line skips the rest
        // of this document only.
        continue;
      }
    }
  }
}
