#!/usr/bin/perl

=head1 DEPLOY

# approved by pankovpv
# .migr
[
	{
		type => 'sql',
		webstop => "0",
		db => "ppc:all",
		when => 'before',
		time_estimate => "1 секунда",
		comment => "также следует выполнить в песочнице",
		sql => ["CREATE TABLE `bs_order_target_stat` (
					`OrderID` int(10) unsigned NOT NULL,
					`stat_date` date NOT NULL,
					`target_type` tinyint(3) unsigned NOT NULL,
					`shows` int(10) NOT NULL,
					`clicks` int(10) NOT NULL,
					`sum` decimal(16,6) NOT NULL,
	  				`sessions_num` int(10) NOT NULL DEFAULT '0',
					`sessions_len` int(10) NOT NULL DEFAULT '0',
	  				`goals_num` int(10) NOT NULL DEFAULT '0',
					PRIMARY KEY (`OrderID`,`stat_date`,`target_type`),
					KEY `stat_date` (`stat_date`)
				) ENGINE=InnoDB DEFAULT CHARSET=utf8
				/*!50100 PARTITION BY RANGE (YEAR(stat_date))
				(
					PARTITION p2007 VALUES LESS THAN (2008),
					PARTITION p2008 VALUES LESS THAN (2009),
					PARTITION p2009 VALUES LESS THAN (2010),
					PARTITION p2010 VALUES LESS THAN (2011),
					PARTITION p2011 VALUES LESS THAN (2012),
					PARTITION p2012 VALUES LESS THAN (2013),
					PARTITION p2013 VALUES LESS THAN (2014),
					PARTITION p2014 VALUES LESS THAN (2015),
					PARTITION p2015 VALUES LESS THAN (2016),
					PARTITION p2016 VALUES LESS THAN (2017),
					PARTITION p2017 VALUES LESS THAN (2018),
					PARTITION p2018 VALUES LESS THAN (2019),
					PARTITION p2019 VALUES LESS THAN (2020),
					PARTITION p2020 VALUES LESS THAN (2021),
					PARTITION p2021 VALUES LESS THAN (2022),
					PARTITION p2022 VALUES LESS THAN (2023),
					PARTITION p2023 VALUES LESS THAN (2024),
					PARTITION p2024 VALUES LESS THAN MAXVALUE
				) */",

				"CREATE TABLE `bs_order_stat_time` (
					`OrderID` int(11) unsigned NOT NULL,
					`stat_time` timestamp NOT NULL,
					PRIMARY KEY (`OrderID`)
				) ENGINE=InnoDB DEFAULT CHARSET=utf8",

				"CREATE TABLE `order_nds_discount` (
					`OrderID` int(10) unsigned NOT NULL,
					`date` date NOT NULL,
					`nds` decimal(6,4) unsigned NOT NULL,
					`discount` decimal(10,8) unsigned NOT NULL,
					`currency` char(3) NOT NULL,
					PRIMARY KEY (`OrderID`,`date`)
				) ENGINE=InnoDB DEFAULT CHARSET=utf8",

				"CREATE TABLE `currency_rates` (
					`currency` char(3) NOT NULL,
					`date` date NOT NULL,
					`rate` decimal(24,16) unsigned NOT NULL,
					PRIMARY KEY (`currency`,`date`)
				) ENGINE=InnoDB DEFAULT CHARSET=utf8",
		]
	},
	{
	  type => 'script',
	  when => 'after',
	  time_estimate => "2-3 дня",
	  comment => 'Запускать после подтверждения n-boy (нужно убедиться что файл со статистикой актуален, и лежит по указанному пути).

	  			  Втягиваем статистику в новую таблицу, за весь период (из старой таблицы, и из файла). 
	  			  Перед запуском выполнить scp root@ppcdev1:/tmp/campaigns_stat_since_20130401 /tmp/campaigns_stat_since_20130401
	  			  Если начнут сильно отставать реплики - можно перезапустить скрипт с параметром --sleep-coef 3	 (по умолчанию 1)

	  			  ПЕСОЧНИЦА: следует запустить с параметром --sandbox (работает пару минут)'
	}
]

=cut

use strict;
use warnings;
use utf8;

use FindBin qw/$Bin/;
use lib "$Bin/../protected/";
use my_inc "..";

use Settings;
use Yandex::DBTools;
use Yandex::DBShards;
use Yandex::TimeCommon qw/get_distinct_dates/;
use Yandex::ListUtils qw/xminus chunks/;
use Yandex::Retry qw/relaxed/;
use ScriptHelper;
use Tools;
use HashingTools qw/half_md5hex_hash/;
use DBStat;

use List::MoreUtils qw/all zip uniq any/;
use Digest::MD5 qw/md5_hex/;

# Batch sizes and the cut-off date shared by the copy steps below.
my $SELECT_LIMIT = 10_000;
# We want to select up to 100_000 rows at a time; the median display period of a campaign is 71 days
# and we assume it is shown both on search and in the ad network, so 71x2x500 = 70_000 stat rows.
my $ORDERS_CHUNK_SIZE = 500;
my $LINES_CHUNK_SIZE = 100_000;
my $BORDER_DATE_OF_METRIKA_STAT = '2013-04-01';

# %O holds the step switches given on the command line, $SLEEP is the --sleep-coef value
# that is passed to relaxed() around every mass insert.
my %O;
my $SLEEP = 1;
extract_script_params(
	bs_order_target_stat_db   => \$O{bs_order_target_stat_db},
	bs_order_target_stat_file => \$O{bs_order_target_stat_file},
	order_nds_discount        => \$O{order_nds_discount},
	bs_order_stat_time        => \$O{bs_order_stat_time},
	sandbox                   => \$O{sandbox},
	'sleep-coef=i'            => \$SLEEP,
) or die "can't parse options";

# No switch given at all => turn on the "all" mode.
if (!grep { defined $_ } values %O) {
	$O{all} = 1;
}

$log->out('START');

# Copy statistics without Metrika data (stat_date before $BORDER_DATE_OF_METRIKA_STAT, i.e. 2013-04-01)
# from ppcordstat.bs_order_target_stat into bs_order_target_stat of the matching PPC shard.
# Author's estimate: 800_000 rows per minute => about 19 hours.
if ($O{bs_order_target_stat_db} || $O{all}) {
	$log->out("start copying statistics from ppcordstat.bs_order_target_stat");

	# The column list is the same for every chunk, so it is built once.
	my @stat_fields = qw/OrderID stat_date target_type shows clicks sum/;
	my $stat_fields_str = join ', ', @stat_fields;

	my ($order_id_min, $order_id_max) = get_one_line_array_sql(PPCORDSTAT, "select min(OrderID), max(OrderID) from bs_order_target_stat");
	# min()/max() over an empty table are NULL (undef here); skip the loop in that case
	# instead of comparing undef values and running a pointless select.
	$log->out("ppcordstat.bs_order_target_stat is empty, nothing to copy") unless defined $order_id_min;

	my $order_id_start = $order_id_min;
	while (defined $order_id_start && $order_id_start <= $order_id_max) {
		my $order_id_end = $order_id_start + $ORDERS_CHUNK_SIZE - 1;

		# connection attributes are set here, right before the query, in case the connection drops and restores itself
		get_dbh(PPCORDSTAT)->{mysql_use_result} = 1;
		get_dbh(PPCORDSTAT)->{AutoInactiveDestroy} = 1;

		my $sth = exec_sql(PPCORDSTAT, "select $stat_fields_str 
										  from bs_order_target_stat 
										 where OrderID between ? and ? and stat_date < ?", 
										 	   $order_id_start, $order_id_end, $BORDER_DATE_OF_METRIKA_STAT);
		$log->out(sprintf('sql select executed for OrderID = %s .. %s of %s', $order_id_start, $order_id_end, $order_id_max));

		my $processed = 0;
		while (my $rows = $sth->fetchall_arrayref({}, $SELECT_LIMIT)) {
			foreach_shard OrderID => $rows, sub {
				my ($shard, $rows_sharded) = @_;
				relaxed times => $SLEEP, sub {
					# rows where shows, clicks and sum are all zero are not copied
					do_mass_insert_sql(PPC(shard => $shard), "insert ignore into bs_order_target_stat ($stat_fields_str) values %s",
														 [	map {  [@{$_}{@stat_fields}] }
														  	grep {
														  		any { $_ != 0 } @{$_}{ qw/shows clicks sum/ }
														  	} @$rows_sharded]);
				};
			};

			$processed += scalar(@$rows);
			$log->out("$processed rows processed");
		}
		$sth->finish;

		$order_id_start += $ORDERS_CHUNK_SIZE;
	}
	$log->out("finish copying statistics from ppcordstat.bs_order_target_stat");
}

# Import statistics with Metrika data (starting from 2013-04-01) from a tab-separated file.
# Author's estimate: 150_000 rows per minute => about 35 hours.
if ($O{bs_order_target_stat_file} || $O{all}) {
	$log->out('start import file with stats & metrika data');
	my $stat_file = '/tmp/campaigns_stat_since_20130401';
	my $stat_import = BSStatImport->new(type => 'period');
	my $type_description = $stat_import->_type_description();

	open (my $fh, '<', $stat_file) or $log->die("Can't open file $stat_file: $!");

	# The first line is the header: '#' followed by tab-separated column names.
	my $field_names_row = <$fh>;
	$log->die("File $stat_file is empty") unless defined $field_names_row;
	$log->die("Invalid line with field names: $field_names_row") unless $field_names_row =~ /^#([\w\t]+)$/;

	my @field_names = split /\t/, $1, -1;
	if ($type_description->{required_fields}) {
	    my $missing_fields = xminus($type_description->{required_fields}, \@field_names);
	    if ($missing_fields && @$missing_fields) {
	        my $missing_fields_str = join(', ', @$missing_fields);
	        $log->die("Fields $missing_fields_str are required but was not found in response header: $field_names_row");
	    }
	}

	# Counters that are clamped to zero when negative; a row is kept only if at least one of them is non-zero.
	my @non_zero_fields_group = qw/Shows  Clicks  Cost  CostCur  SessionNum  SessionDepth  GoalsNum/;

	my @lines;
	my $processed = 0;
	# Hand the accumulated batch over to process_lines_from_file() and start a new one.
	my $flush_lines = sub {
		process_lines_from_file(\@lines);

		$processed += scalar(@lines);
		$log->out("$processed rows processed (file import)");

		@lines = ();
	};

	while (my $line = <$fh>) {
		next unless $line =~ /\S+/;
		# without chomp the last column would keep its trailing newline
		chomp $line;

		my @values = split /\t/, $line, scalar(@field_names);
		my %named_line = zip(@field_names, @values);

		foreach (@non_zero_fields_group) {
			$named_line{$_} = 0 if $named_line{$_} < 0;
		}

		$stat_import->_check_named_row(\%named_line);
		$stat_import->_apply_value_modifiers(\%named_line);

		next unless any { $named_line{$_} != 0 } @non_zero_fields_group;

		push @lines, \%named_line;
		$flush_lines->() if scalar(@lines) >= $SELECT_LIMIT;
	}
	# the last, incomplete batch
	$flush_lines->() if @lines;

	close $fh;
	$log->out('finish import file with stats & metrika data');
}

# Insert one batch of parsed file rows into bs_order_target_stat, shard by shard.
#
# Parameters:
#   $lines - arrayref of hashrefs; the keys read here are OrderID, UpdateTime, TargetType,
#            Shows, Clicks, SessionNum, SessionDepth, GoalsNum, Cost and CostCur.
#
# The `sum` column is filled from CostCur, except for orders whose currency
# (as returned by DBStat::_get_orders_data) is 'YND_FIXED': those use Cost.
sub process_lines_from_file {
	my $lines = shift;

	foreach_shard OrderID => $lines, sub {
		my ($shard, $sharded_lines) = @_;

		# Work only with the rows handed to this callback for the current shard;
		# iterating over the full $lines here would insert every row into every shard.
		my @order_ids = uniq map { $_->{OrderID} } @$sharded_lines;
		my $orders_data = DBStat::_get_orders_data(\@order_ids);

		my @lines_to_insert = ();
		foreach my $named_line (@$sharded_lines) {
			my $sum = (($orders_data->{$named_line->{OrderID}}->{currency} // '') ne 'YND_FIXED') ? $named_line->{CostCur} : $named_line->{Cost};
			push @lines_to_insert, [(map { $named_line->{$_} } qw/OrderID  UpdateTime  TargetType  Shows  Clicks  SessionNum  SessionDepth  GoalsNum/), $sum];
		}

		relaxed times => $SLEEP, sub {
			do_mass_insert_sql(PPC(shard => $shard), "insert ignore into bs_order_target_stat (OrderID, stat_date, target_type, shows, clicks, sessions_num, sessions_len, goals_num, sum) values %s",
												 	 \@lines_to_insert);
		};
	};
}

# Sandbox mode: copy statistics from ppcordstat.bs_order_target_stat
# and generate the Metrika columns on the fly.
# Author's estimate: about 1 minute.
if ($O{sandbox}) {
	$log->out("start copying statistics from ppcordstat.bs_order_target_stat (generating metrika stats)");
	my $order_ids = get_one_column_sql(PPCORDSTAT, "select distinct OrderID from bs_order_target_stat");

	# OrderID => number of camp_metrika_goals rows; only such orders get a non-zero conversion below
	my $order_ids_have_goals = get_hash_sql(PPC(shard => 'all'), ['select c.OrderID, count(*) 
																	 from camp_metrika_goals g join campaigns c using(cid)',
																    where => {'c.OrderID' => $order_ids},
																   'group by c.OrderID']);

	get_dbh(PPCORDSTAT)->{mysql_use_result} = 1;
	get_dbh(PPCORDSTAT)->{AutoInactiveDestroy} = 1;

	my @stat_fields = qw/OrderID stat_date target_type shows clicks sum/;
	my @stat_metrika_fields = qw/sessions_num sessions_len goals_num/;
	my $stat_fields_str = join ', ', @stat_fields;
	my $stat_fields_with_metrika_str = join ', ', @stat_fields, @stat_metrika_fields;
	my $sth = exec_sql(PPCORDSTAT, "select $stat_fields_str from bs_order_target_stat");
	$log->out("sql select executed");

	my $processed = 0;
	while (my $rows = $sth->fetchall_arrayref({}, $SELECT_LIMIT)) {
		foreach my $row (@$rows) {
			# the generated values are derived from a hash of the row key, so they are stable between runs
			my $md5_hash = half_md5hex_hash(md5_hex($row->{stat_date}, $row->{target_type}, $row->{OrderID}));

			$row->{sessions_num} = int($row->{clicks} * 0.9 ) || ($row->{clicks} ? 1 : 0);
			# must be stored under the sessions_len key: that is the name listed in
			# @stat_metrika_fields and therefore the value the insert below reads
			$row->{sessions_len} = $row->{sessions_num} * (substr($md5_hash, 6, 1) % 5 + 1);

			my $session_clicks = $row->{clicks} - (substr($md5_hash, 7, 1) % 5 == 0 ? 1 : 0);
			my $conversion = $order_ids_have_goals->{$row->{OrderID}} ? substr($md5_hash, 8, 1) / 10 * 0.2 + 0.4 : 0;
			$row->{goals_num} = int($session_clicks * $conversion);
		}

		relaxed times => $SLEEP, sub {
			# in the sandbox many orders are missing from the metabase and there is only one shard, so the shard is hard-coded
			do_mass_insert_sql(PPC(shard => 1), "insert ignore into bs_order_target_stat ($stat_fields_with_metrika_str) values %s",
											 [map { [ @{$_}{ @stat_fields, @stat_metrika_fields } ] } @$rows]);
		};

		$processed += scalar(@$rows);
		$log->out("$processed rows processed");
	}
	$sth->finish;

	$log->out("finish copying statistics from ppcordstat.bs_order_target_stat");
}


# Copy ppcordstat.order_nds_discount into order_nds_discount of the matching PPC shard.
# Runs for the explicit switch, in "all" mode and in sandbox mode.
# Author's estimate: about 1 hour (1.5 million rows per 100 seconds).
if ($O{order_nds_discount} || $O{all} || $O{sandbox}) {
	$log->out('start copying data ppcordstat.order_nds_discount => ppc.order_nds_discount');
	my $order_ids = get_one_column_sql(PPCORDSTAT, "select distinct OrderID from order_nds_discount order by OrderID");
	$log->out('orders with nds/discount is fetched');

	my @nds_fields = qw/OrderID date nds discount currency/;
	my $nds_fields_str = join ', ', @nds_fields;
	my $orders_total = scalar(@$order_ids);

	# The ids are sorted, so every chunk maps onto a contiguous "between first and last" range.
	my @order_id_chunks = chunks($order_ids, $ORDERS_CHUNK_SIZE);
	for my $chunk_idx (0 .. $#order_id_chunks) {
		my ($first_order_id, $last_order_id) = @{ $order_id_chunks[$chunk_idx] }[0, -1];
		my $pos_from = $chunk_idx * $ORDERS_CHUNK_SIZE;
		my $pos_to = ($chunk_idx + 1) * $ORDERS_CHUNK_SIZE - 1;

		# set again for every chunk, right before the query
		my $dbh = get_dbh(PPCORDSTAT);
		$dbh->{mysql_use_result} = 1;
		get_dbh(PPCORDSTAT)->{AutoInactiveDestroy} = 1;

		my $sth = exec_sql(PPCORDSTAT, "select $nds_fields_str
										  from order_nds_discount 
										 where OrderID between ? and ?", 
			$first_order_id, $last_order_id);
		$log->out(sprintf('sql select executed for orders  %s .. %s out of %s', $pos_from, $pos_to, $orders_total));

		my $rows_done = 0;
		while (my $batch = $sth->fetchall_arrayref({}, $SELECT_LIMIT)) {
			foreach_shard OrderID => $batch, sub {
				my ($shard, $shard_rows) = @_;
				relaxed times => $SLEEP, sub {
					my @tuples = map { [ @{$_}{@nds_fields} ] } @$shard_rows;
					do_mass_insert_sql(PPC(shard => $shard), "insert ignore into order_nds_discount ($nds_fields_str) values %s", \@tuples);
				};
			};

			$rows_done += @$batch;
			$log->out("$rows_done rows processed");
		}
		$sth->finish;
	}
	$log->out('finish copying data ppcordstat.order_nds_discount => ppc.order_nds_discount');
}

# Copy ppcordstat.bs_order_stat_time into bs_order_stat_time of the matching PPC shard.
# Author's estimate: about 5 minutes.
if ($O{bs_order_stat_time} || $O{all}) {
	$log->out('start copying data ppcordstat.bs_order_stat_time => ppc.bs_order_stat_time');

	my $order_ids = get_one_column_sql(PPCORDSTAT, "select distinct OrderID from bs_order_stat_time order by OrderID");
	$log->out('orders from bs_order_stat_time is fetched');

	my @fields = qw/OrderID stat_time/;
	my $fields_str = join ', ', @fields;
	my $orders_total = scalar(@$order_ids);

	my $chunk_no = 0;
	for my $order_ids_chunk (chunks($order_ids, $LINES_CHUNK_SIZE)) {
		my $pos_from = $chunk_no * $LINES_CHUNK_SIZE;
		$chunk_no++;
		my $pos_to = $chunk_no * $LINES_CHUNK_SIZE - 1;

		# connection attributes are set here, right before the query, in case the connection drops and restores itself
		for my $attr (qw/mysql_use_result AutoInactiveDestroy/) {
			get_dbh(PPCORDSTAT)->{$attr} = 1;
		}

		my $sth = exec_sql(PPCORDSTAT, "select $fields_str 
										  from bs_order_stat_time 
										 where OrderID between ? and ?", 
			$order_ids_chunk->[0], $order_ids_chunk->[-1]);
		$log->out(sprintf('sql select executed for orders  %s .. %s out of %s', $pos_from, $pos_to, $orders_total));

		my $rows_done = 0;
		while (my $batch = $sth->fetchall_arrayref({}, $SELECT_LIMIT)) {
			foreach_shard OrderID => $batch, sub {
				my ($shard, $shard_rows) = @_;
				relaxed times => $SLEEP, sub {
					my @tuples = map { [ @{$_}{@fields} ] } @$shard_rows;
					do_mass_insert_sql(PPC(shard => $shard), "insert ignore into bs_order_stat_time ($fields_str) values %s", \@tuples);
				};
			};

			$rows_done += @$batch;
			$log->out("$rows_done rows processed");
		}
		$sth->finish;
	}
	$log->out('finish copying data ppcordstat.bs_order_stat_time => ppc.bs_order_stat_time');
}

$log->out('FINISHED');
