#!/usr/bin/perl -w
#подключение счетчика запросов из cdict и их нормализация

use strict;

use utf8;
use open ":utf8";
use Data::Dumper;

use Digest::MD5 qw(md5 md5_hex md5_base64);
use Encode;

binmode STDIN, ':utf8';
binmode STDOUT, ':utf8';
binmode STDERR, ':utf8';

use lib "/home/yuryz/arcadia/rt-research/broadmatching/scripts/lib";
use Project;

use Utils::Common;
use Project;
use BM::Phrase;
use BM::PhraseList;
use Time::HiRes qw(tv_interval gettimeofday);


# Project handle; the per-line callback uses it to build phrase objects.
# Constructed with the load_dicts and load_minicategs_light options switched on.
my $proj = Project->new({
    load_dicts            => 1,
    load_minicategs_light => 1,
});

# Worker object that drives the processing; it is configured via its hash fields.
# NOTE(review): Utils::Worker is not loaded explicitly in this file -- presumably
# it is pulled in by Utils::Common or Project; confirm.
my $worker = Utils::Worker->new;

# Input and output paths are the first two command-line arguments
# (either may be undef when the argument is omitted).
my ($input_path, $output_path) = @ARGV;

$worker->{verbose}       = 1;
# presumably the number of parallel worker processes -- confirm against Utils::Worker
$worker->{num_processes} = 12;

$worker->{file_input}    = $input_path;
$worker->{file_output}   = $output_path;

# Per-line callback: appends the query's search count and its normalized form
# to one tab-separated input record.
#
# Arguments:
#   $line - one raw input line
#   $fh   - filehandle the enriched record is printed to
#
# Input columns (tab-separated): url, position, query, clicks, freq, title.
# Only the query (third column) is used here.
#
# Output: the original line followed by "\t<search_count>\t<normalized query>".
# Records with a missing/empty query, or whose search count is undefined or
# not positive, are dropped.
#
# The callback's result is true only when a line was actually printed.
$worker->{process_line}     = sub {
    my ($line, $fh) = @_;
    chomp $line;

    # Skip url and position; the remaining columns are not needed.
    my (undef, undef, $query) = split /\t/, $line;

    # Test for definedness and length rather than truth, so that the
    # legitimate query "0" is not silently discarded.
    if (defined $query && length $query) {
        # Build the phrase object once and reuse it for both lookups.
        my $phrase       = $proj->phrase($query);
        my $search_count = $phrase->get_search_count;
        # $search_count += 2;  # Laplace (additive) smoothing -- currently disabled

        # Guard against an undefined count (avoids a numeric-comparison warning)
        # and normalize the query only for records that will be written.
        if (defined $search_count && $search_count > 0) {
            my $query_norm = $phrase->norm_phr;
            print {$fh} "$line\t$search_count\t$query_norm\n";
        }
    }
};

# Hand control to the worker, which invokes the callback above.
$worker->process_data;
