#!/usr/bin/perl -w

=head1 NAME

    mark_moves_in_diff

=head1 DESCRIPTION

    Takes the name of a diff file as the first argument and prints its contents to STDOUT;
    identical pieces of code are marked with # chunk0023:32 (23 is the number of the unique piece of text, 32 is the number of lines)

=cut

use strict;
use warnings;

use Digest::MD5 qw/md5_base64/;

use Carp qw/croak/;
use Encode qw/encode_utf8 is_utf8/;

use open ':std' => ':utf8';
use utf8;

# h($text): base64 MD5 digest of $text.
# Strings carrying the UTF-8 flag are encoded to UTF-8 bytes first;
# any other string is digested exactly as given.
sub h {
    my ($text) = @_;
    my $bytes = is_utf8($text) ? encode_utf8($text) : $text;
    return Digest::MD5::md5_base64($bytes);
}

# Chunk-size settings used when candidate chunks are enumerated.
my $MIN_LINES = 3;
my $MAX_LINES = 300;

# Exactly one argument is accepted, and it must name an existing plain file.
unless (@ARGV == 1 && -f $ARGV[0]) {
    croak "Usage: $0 file.diff";
}

# Load the whole diff into memory: one array element per line,
# with the trailing newline removed.
open(my $fh, "<:utf8", $ARGV[0]) or die "Can't open $ARGV[0]: $!";
my @text = <$fh>;
chomp @text;
close $fh;

# Read the diff and split it into blocks of consecutive "+" or "-" lines.
# Each block records:
#   type - "+" or "-"
#   line - index in @text of the block's first line
#   code - the block's lines without the leading sign and with all
#          whitespace removed
# A line only counts as a change line when the character after the sign
# is not itself "+" or "-" (so e.g. "---" / "+++" lines do not match).
my $prev_type = '';
my @blocks;
LINE:
for my $line (0 .. $#text) {
    my ($type, $code) = $text[$line] =~ /^([+-])(|[^-+].*)$/;

    unless (defined $type) {
        # Anything that is not a change line ends the current run.
        $prev_type = '';
        next LINE;
    }

    $code =~ s/\s//g;

    # A change of sign (or the first change line after a break) opens a new block.
    if ($type ne $prev_type) {
        push @blocks, {code => [], line => $line, type => $type};
        $prev_type = $type;
    }
    push @{$blocks[-1]{code}}, $code;
}

# Compute hashes of every contiguous chunk inside each block.
#   %BY_LINE: index of the chunk's first line in @text => list of [hash, line count],
#             longest chunk first (entries are unshifted as the chunk grows)
#   %BY_HASH: hash => list of [index of first line, line count]
# A chunk is skipped when its first or last line is empty, or when it averages
# two characters or fewer per line.
#
# NOTE(review): the end index starts at $i + $MIN_LINES and is inclusive, so the
# shortest chunk considered has $MIN_LINES + 1 lines and the longest has
# $MAX_LINES + 1 -- confirm whether that is intended.
my %BY_HASH;
my %BY_LINE;
for my $block (@blocks) {
    my @code = @{$block->{code}};
    for my $i (0 .. $#code - $MIN_LINES) {
        next if $code[$i] eq '';

        my $line = $block->{line} + $i;

        # Last admissible end index for a chunk starting at $i.
        my $last = $i + $MAX_LINES;
        $last = $#code if $last > $#code;

        # Grow the chunk text one line at a time instead of re-joining the
        # whole slice for every end index.
        my $hash_text = join "", @code[$i .. $i + $MIN_LINES - 1];
        for my $j ($i + $MIN_LINES .. $last) {
            $hash_text .= $code[$j];
            my $cnt = $j - $i + 1;

            # Filter first, so no digest is computed for rejected chunks.
            next if $code[$j] eq '' || length($hash_text) / $cnt <= 2;

            my $hash = h($hash_text);
            unshift @{$BY_LINE{$line}}, [$hash, $cnt];
            push @{$BY_HASH{$hash}}, [$line, $cnt];
        }
    }
}

# Find the repeated chunks.
#   %choosed:     hash of a selected chunk => its sequence number, assigned
#                 the first time the chunk is selected
#   $choosed_len: how many lines of the most recently selected chunk are still
#                 ahead of the current line
# A chunk starting on the current line is selected when the same hash also
# starts on a different line and that occurrence is longer than $choosed_len,
# so shorter sub-chunks of an already selected chunk are not selected again.
my %choosed;
my $choosed_len = 0;
for my $line (0 .. $#text) {
    for my $h (@{ $BY_LINE{$line} || [] }) {
        for my $hh (@{ $BY_HASH{$h->[0]} }) {
            next unless $hh->[0] != $line && $hh->[1] > $choosed_len;
            $choosed{$h->[0]} ||= (scalar keys %choosed)+1;
            $choosed_len = $hh->[1];
        }
    }

    # Count down on EVERY line, including lines where no chunk starts;
    # otherwise the counter goes stale across such lines and can reject a
    # later repeated chunk.
    $choosed_len-- if $choosed_len > 0;
}

# Print the diff text with the marks added.
#   %delayed: label "chunkNNNN:LEN" => number of lines that still have to be
#             printed with this label
# Marked lines have tabs expanded to four spaces and trailing whitespace
# removed, are padded to 160 characters and get " # label,label,..." appended.
# All other lines are printed unchanged.
my %delayed;
for my $line (0 .. $#text) {
    # Start (or restart) the countdown for every selected chunk beginning here.
    for my $h (@{ $BY_LINE{$line} || [] }) {
        next unless exists $choosed{$h->[0]};
        my $chunk_text = sprintf "chunk%04d:%d", $choosed{$h->[0]}, $h->[1];
        $delayed{$chunk_text} = $h->[1];
    }

    # Collect the labels active on this line and drop those that are used up.
    my @suf;
    for my $ch (sort keys %delayed) {
        push @suf, $ch;
        delete $delayed{$ch} unless --$delayed{$ch};
    }

    if (@suf) {
        $text[$line] =~ s/\t/    /g;
        $text[$line] =~ s/\s+$//;

        # Lines longer than 160 characters get no padding; clamping avoids
        # the "Negative repeat count does nothing" warning from the x operator.
        my $pad = 160 - length($text[$line]);
        $pad = 0 if $pad < 0;

        print $text[$line], " " x $pad, " # ", join(",", @suf), "\n";
    } else {
        print $text[$line], "\n";
    }
}
